From 0577970fee5ec903fc7417fbdd394134d636d722 Mon Sep 17 00:00:00 2001 From: yanghaoran Date: Tue, 5 May 2020 21:14:48 +0800 Subject: [PATCH] Update GraphEngine to synchronize with latest Ascend driver software suite 4 May 2020 --- inc/common/blocking_queue.h | 3 +- inc/common/dynamic_aipp.h | 16 +- inc/common/npu_error_define.h | 8 +- inc/common/opskernel/ge_task_info.h | 3 +- inc/common/opskernel/ops_kernel_info_store.h | 16 +- inc/common/opskernel/ops_kernel_info_types.h | 1 + inc/common/optimizer/graph_optimizer.h | 9 +- inc/common/optimizer/graph_optimizer_types.h | 4 +- inc/external/ge/ge_api_types.h | 53 +- inc/external/ge/ge_ir_build.h | 75 + inc/external/graph/attr_value.h | 2 +- inc/external/graph/graph.h | 2 +- inc/external/graph/inference_context.h | 6 +- inc/external/graph/operator.h | 6 +- inc/external/graph/operator_factory.h | 4 +- inc/external/graph/operator_reg.h | 46 +- inc/external/graph/tensor.h | 4 +- inc/external/graph/types.h | 4 +- inc/external/register/register.h | 35 + inc/external/register/register_error_codes.h | 3 +- inc/external/register/register_types.h | 4 + inc/framework/common/debug/ge_log.h | 67 +- inc/framework/common/debug/log.h | 155 +- inc/framework/common/ge_inner_error_codes.h | 8 +- inc/framework/common/ge_types.h | 34 +- inc/framework/common/helper/model_helper.h | 42 +- inc/framework/common/helper/om_file_helper.h | 9 +- inc/framework/common/l2_cache_optimize.h | 2 +- inc/framework/common/op/attr_define.h | 4 + inc/framework/common/op/attr_value_util.h | 5 +- inc/framework/common/op/ge_op_utils.h | 27 +- inc/framework/common/op/op_parser_util.h | 4 +- inc/framework/common/scope_guard.h | 2 +- inc/framework/common/types.h | 23 +- inc/framework/common/util.h | 164 +- inc/framework/dlog/log.h | 4 +- inc/framework/engine/dnnengine.h | 2 +- inc/framework/executor/ge_executor.h | 70 +- inc/framework/generator/ge_generator.h | 6 +- inc/framework/generator/generator_api.h | 1 - inc/framework/memory/memory_assigner.h | 4 +- inc/framework/omg/omg_inner_types.h | 33 +- inc/framework/omg/version.h | 8 - inc/graph/anchor.h | 26 +- inc/graph/attr_value_serializable.h | 13 +- inc/graph/buffer.h | 5 +- inc/graph/compute_graph.h | 77 +- inc/graph/debug/ge_attr_define.h | 322 ++- inc/graph/def_types.h | 9 +- inc/graph/detail/attributes_holder.h | 3 +- inc/graph/detail/model_serialize_imp.h | 10 +- inc/graph/ge_attr_value.h | 6 +- inc/graph/ge_context.h | 3 +- inc/graph/ge_local_context.h | 2 - inc/graph/ge_tensor.h | 22 +- inc/graph/model.h | 3 +- inc/graph/model_serialize.h | 3 +- inc/graph/node.h | 14 +- inc/graph/op_desc.h | 27 +- inc/graph/operator_factory_impl.h | 2 - inc/graph/shape_refiner.h | 6 +- inc/graph/usr_types.h | 6 +- inc/graph/utils/attr_utils.h | 5 +- inc/graph/utils/graph_utils.h | 325 ++- inc/graph/utils/node_utils.h | 5 + inc/graph/utils/op_desc_utils.h | 127 +- inc/graph/utils/tensor_utils.h | 15 +- src/common/graph/CMakeLists.txt | 1 + src/common/graph/anchor.cc | 2 + src/common/graph/buffer.cc | 3 +- src/common/graph/compute_graph.cc | 223 +- src/common/graph/debug/ge_log.h | 104 +- src/common/graph/debug/ge_util.h | 1 - src/common/graph/debug/graph_debug.cc | 2 - src/common/graph/debug/graph_debug.h | 2 - src/common/graph/detail/attributes_holder.cc | 2 - src/common/graph/format_refiner.cc | 35 +- src/common/graph/ge_attr_define.cc | 281 ++- src/common/graph/ge_attr_value.cc | 49 +- src/common/graph/ge_tensor.cc | 34 +- src/common/graph/model.cc | 17 +- src/common/graph/model_serialize.cc | 37 +- src/common/graph/node.cc | 7 +- 
src/common/graph/op_desc.cc | 193 +- src/common/graph/op_imp.cc | 3 +- src/common/graph/operator.cc | 53 +- src/common/graph/opsproto/opsproto_manager.cc | 6 +- src/common/graph/option/ge_context.cc | 2 +- src/common/graph/shape_refiner.cc | 150 +- src/common/graph/tensor.cc | 8 +- src/common/graph/utils/ge_ir_utils.cc | 222 +- src/common/graph/utils/ge_ir_utils.h | 14 + src/common/graph/utils/graph_utils.cc | 944 +++++++- src/common/graph/utils/node_utils.cc | 44 + src/common/graph/utils/op_desc_utils.cc | 83 +- src/common/graph/utils/tensor_utils.cc | 25 +- src/common/graph/utils/type_utils.cc | 317 +-- src/ge/CMakeLists.txt | 73 +- src/ge/client/CMakeLists.txt | 2 +- src/ge/client/ge_api.cc | 42 +- src/ge/common/CMakeLists.txt | 9 +- src/ge/common/auth/file_saver.cc | 96 +- src/ge/common/auth/file_saver.h | 54 +- src/ge/common/convert/pb2json.cc | 38 +- src/ge/common/convert/pb2json.h | 2 + src/ge/common/debug/memory_dumper.cc | 22 +- src/ge/common/debug/memory_dumper.h | 2 +- .../format_transfers/datatype_transfer.cc | 2 + .../format_transfer_c1hwncoc0_hwcn.cc | 20 +- .../format_transfer_dhwcn_fracz3D.cc | 179 ++ .../format_transfer_dhwcn_fracz3D.h | 34 + ...format_transfer_dhwnc_fracz3D_transpose.cc | 180 ++ .../format_transfer_dhwnc_fracz3D_transpose.h | 34 + .../format_transfer_hwcn_c1hwncoc0.cc | 24 +- .../format_transfer_nchw_fz_c04.cc | 306 +++ .../format_transfer_nchw_fz_c04.h | 35 + .../formats/utils/formats_definitions.h | 45 +- src/ge/common/fp16_t.cc | 1213 +++++++++-- src/ge/common/fp16_t.h | 826 ++++--- src/ge/common/ge/plugin_manager.cc | 1 + src/ge/common/ge_format_util.cc | 1 - src/ge/common/helper/model_helper.cc | 38 +- src/ge/common/helper/om_file_helper.cc | 31 +- src/ge/common/math/fp16_math.cc | 171 ++ src/ge/common/math/fp16_math.h | 96 + src/ge/common/math/math_util.h | 687 +++++- src/ge/common/math_util.h | 4 +- src/ge/common/model_parser/base.cc | 12 +- src/ge/common/model_parser/base.h | 56 +- src/ge/common/model_saver.cc | 6 +- src/ge/common/op/attr_define.cc | 6 +- src/ge/common/op/attr_value_util.cc | 56 +- src/ge/common/op/ge_op_utils.cc | 172 +- src/ge/common/profiling/profiling_manager.cc | 225 +- src/ge/common/profiling/profiling_manager.h | 30 +- src/ge/common/properties_manager.cc | 33 +- src/ge/common/properties_manager.h | 4 + src/ge/common/thread_pool.h | 4 +- src/ge/common/types.cc | 261 ++- src/ge/common/util.cc | 62 +- src/ge/engine_manager/dnnengine_manager.cc | 15 +- src/ge/engine_manager/dnnengine_manager.h | 4 +- src/ge/executor/CMakeLists.txt | 4 + src/ge/executor/ge_executor.cc | 520 +++-- src/ge/ge_local_engine/CMakeLists.txt | 2 +- .../ge_local_engine/engine/host_cpu_engine.cc | 249 +++ .../ge_local_engine/engine/host_cpu_engine.h | 78 + .../ge_local_ops_kernel_info.cc | 28 +- .../ops_kernel_store/op/ge_deleted_op.cc | 1 + .../ops_kernel_store/op/no_op.cc | 13 + src/ge/ge_runtime/runtime_model.cc | 4 + src/ge/ge_runtime/task/aicpu_task.cc | 1 + src/ge/ge_runtime/task/cce_task.cc | 2 + src/ge/ge_runtime/task/tbe_task.cc | 1 + src/ge/generator/ge_generator.cc | 33 +- .../{graph_build.cc => graph_builder.cc} | 77 +- .../build/{graph_build.h => graph_builder.h} | 1 + src/ge/graph/build/label_allocator.cc | 80 + src/ge/graph/build/label_allocator.h | 39 + .../graph/build/logical_stream_allocator.cc | 106 +- src/ge/graph/build/logical_stream_allocator.h | 5 + .../graph/build/memory/block_mem_assigner.cc | 282 ++- .../graph/build/memory/block_mem_assigner.h | 33 +- .../graph/build/memory/graph_mem_assigner.cc | 511 ++++- 
.../graph/build/memory/graph_mem_assigner.h | 13 +- src/ge/graph/build/memory/memory_assigner.cc | 27 +- .../graph/build/memory/var_mem_assign_util.cc | 5 +- src/ge/graph/build/model_builder.cc | 90 +- src/ge/graph/build/model_builder.h | 7 +- src/ge/graph/build/run_context.cc | 46 +- src/ge/graph/build/run_context.h | 16 +- src/ge/graph/build/stream_allocator.cc | 43 +- ...eam_graph.cc => stream_graph_optimizer.cc} | 34 +- ...tream_graph.h => stream_graph_optimizer.h} | 10 +- src/ge/graph/build/task_generator.cc | 246 ++- src/ge/graph/build/task_generator.h | 31 +- src/ge/graph/common/bcast.h | 19 - src/ge/graph/common/omg_util.cc | 5 +- src/ge/graph/common/transop_util.cc | 4 +- src/ge/graph/execute/graph_execute.cc | 82 +- src/ge/graph/execute/graph_execute.h | 9 + src/ge/graph/label/case_label_maker.cc | 130 ++ src/ge/graph/label/case_label_maker.h | 94 + src/ge/graph/label/if_label_maker.cc | 118 + src/ge/graph/label/if_label_maker.h | 80 + src/ge/graph/label/label_maker.cc | 392 ++++ src/ge/graph/label/label_maker.h | 65 + src/ge/graph/label/label_maker_factory.h | 89 + .../label/partitioned_call_label_maker.cc | 58 + .../label/partitioned_call_label_maker.h | 66 + src/ge/graph/label/while_label_maker.cc | 126 ++ src/ge/graph/label/while_label_maker.h | 80 + .../new_model_manager/cpu_queue_schedule.cc | 372 ++++ .../new_model_manager/cpu_queue_schedule.h | 168 ++ .../load/new_model_manager/data_dumper.cc | 90 +- .../load/new_model_manager/data_dumper.h | 7 +- .../load/new_model_manager/data_inputer.h | 2 +- .../load/new_model_manager/davinci_model.cc | 1908 +++++++++++++---- .../load/new_model_manager/davinci_model.h | 339 ++- .../new_model_manager/davinci_model_parser.cc | 2 +- .../load/new_model_manager/model_manager.cc | 68 +- .../load/new_model_manager/model_manager.h | 11 +- .../load/new_model_manager/model_output.cc | 3 +- .../load/new_model_manager/model_utils.cc | 171 +- .../load/new_model_manager/model_utils.h | 15 +- .../task_info/end_graph_task_info.cc | 35 +- .../task_info/end_graph_task_info.h | 5 + .../task_info/fusion_start_task_info.cc | 1 + .../task_info/fusion_stop_task_info.cc | 1 + .../task_info/hccl_task_info.cc | 11 +- .../task_info/kernel_ex_task_info.cc | 59 +- .../task_info/kernel_ex_task_info.h | 8 +- .../task_info/kernel_task_info.cc | 327 ++- .../task_info/kernel_task_info.h | 59 +- .../task_info/label_goto_task_info.cc | 1 + .../task_info/label_set_task_info.cc | 1 + .../task_info/memcpy_async_task_info.cc | 11 +- .../task_info/profiler_trace_task_info.cc | 7 +- .../task_info/stream_active_task_info.cc | 2 - .../task_info/stream_switch_task_info.cc | 7 +- .../task_info/stream_switchn_task_info.cc | 152 ++ .../task_info/stream_switchn_task_info.h | 52 + .../task_info/super_kernel/super_kernel.cc | 39 + .../task_info/super_kernel/super_kernel.h | 45 + .../super_kernel/super_kernel_factory.cc | 160 ++ .../super_kernel/super_kernel_factory.h | 47 + .../new_model_manager/task_info/task_info.h | 14 + .../new_model_manager/tbe_handle_store.cc | 2 +- .../load/new_model_manager/tbe_handle_store.h | 2 +- src/ge/graph/load/output/output.cc | 27 +- src/ge/graph/load/output/output.h | 7 +- src/ge/graph/manager/graph_manager.cc | 185 +- src/ge/graph/manager/graph_manager.h | 10 +- src/ge/graph/manager/graph_manager_utils.cc | 29 +- src/ge/graph/manager/graph_manager_utils.h | 13 +- src/ge/graph/manager/graph_mem_allocator.cc | 13 +- src/ge/graph/manager/graph_var_manager.cc | 33 +- src/ge/graph/manager/trans_var_data_utils.cc | 28 +- 
src/ge/graph/manager/trans_var_data_utils.h | 7 +- src/ge/graph/manager/util/debug.cc | 2 +- src/ge/graph/manager/util/debug.h | 2 +- src/ge/graph/manager/util/hcom_util.cc | 26 +- src/ge/graph/optimize/common/params.h | 4 - src/ge/graph/optimize/graph_optimize.cc | 7 +- src/ge/graph/optimize/optimizer/graph_pass.h | 93 - src/ge/graph/partition/engine_place.cc | 2 +- src/ge/graph/partition/engine_place.h | 8 +- src/ge/graph/partition/graph_partition.cc | 337 +-- src/ge/graph/partition/graph_partition.h | 71 +- .../passes/aicpu_constant_folding_pass.cc | 88 +- .../passes/aicpu_constant_folding_pass.h | 13 +- src/ge/graph/passes/atomic_addr_clean_pass.cc | 16 +- src/ge/graph/passes/base_pass.cc | 89 +- src/ge/graph/passes/base_pass.h | 25 +- src/ge/graph/passes/cast_remove_pass.cc | 138 ++ ...e_net_output_pass.h => cast_remove_pass.h} | 26 +- src/ge/graph/passes/cast_translate_pass.cc | 41 +- .../common_subexpression_elimination_pass.cc | 111 + .../common_subexpression_elimination_pass.h | 21 +- src/ge/graph/passes/compile_nodes_pass.cc | 2 + src/ge/graph/passes/compile_nodes_pass.h | 2 +- src/ge/graph/passes/constant_folding_pass.cc | 39 +- src/ge/graph/passes/dimension_compute_pass.cc | 2 +- src/ge/graph/passes/flow_ctrl_pass.cc | 27 +- .../graph/passes/folding_kernel/add_kernel.cc | 129 +- .../graph/passes/folding_kernel/add_kernel.h | 10 +- .../folding_kernel/broadcast_args_kernel.cc | 16 +- .../passes/folding_kernel/cast_kernel.cc | 46 +- .../folding_kernel/concat_offset_kernel.cc | 9 +- .../passes/folding_kernel/concat_v2_kernel.cc | 15 +- .../folding_kernel/dynamic_stitch_kernel.cc | 239 ++- .../folding_kernel/dynamic_stitch_kernel.h | 10 +- .../passes/folding_kernel/empty_kernel.cc | 6 +- .../passes/folding_kernel/fill_kernel.cc | 8 +- .../passes/folding_kernel/floordiv_kernel.cc | 6 +- .../passes/folding_kernel/floordiv_kernel.h | 2 +- .../passes/folding_kernel/floormod_kernel.cc | 17 +- .../passes/folding_kernel/gather_v2_kernel.cc | 2 + .../passes/folding_kernel/greater_kernel.cc | 1 + .../passes/folding_kernel/kernel_utils.cc | 20 +- .../passes/folding_kernel/maximum_kernel.cc | 6 +- .../graph/passes/folding_kernel/mul_kernel.cc | 91 +- .../graph/passes/folding_kernel/mul_kernel.h | 12 + .../passes/folding_kernel/permute_kernel.cc | 27 +- .../passes/folding_kernel/range_kernel.cc | 12 +- .../passes/folding_kernel/rank_kernel.cc | 5 +- .../folding_kernel/reduce_prod_kernel.cc | 19 +- .../passes/folding_kernel/reformat_kernel.cc | 21 +- .../passes/folding_kernel/reformat_kernel.h | 2 +- .../passes/folding_kernel/rsqrt_kernel.cc | 4 +- .../passes/folding_kernel/shape_kernel.cc | 6 +- .../passes/folding_kernel/shape_n_kernel.cc | 9 +- .../passes/folding_kernel/size_kernel.cc | 1 - .../passes/folding_kernel/slice_d_kernel.cc | 161 ++ .../passes/folding_kernel/slice_d_kernel.h | 35 + .../passes/folding_kernel/slice_kernel.cc | 14 +- .../passes/folding_kernel/squeeze_kernel.cc | 2 +- .../folding_kernel/ssd_prior_box_kernel.cc | 19 +- .../folding_kernel/strided_slice_kernel.cc | 79 +- .../folding_kernel/strided_slice_kernel.h | 7 +- .../graph/passes/folding_kernel/sub_kernel.cc | 109 +- .../graph/passes/folding_kernel/sub_kernel.h | 14 + .../passes/folding_kernel/transdata_kernel.cc | 37 +- .../passes/folding_kernel/unpack_kernel.cc | 91 + .../folding_kernel/unpack_kernel.h} | 28 +- src/ge/graph/passes/folding_pass.cc | 10 + src/ge/graph/passes/folding_pass.h | 4 +- .../graph/passes/get_original_format_pass.cc | 2 + src/ge/graph/passes/guarantee_const_pass.cc | 6 +- 
src/ge/graph/passes/hccl_memcpy_pass.cc | 82 +- src/ge/graph/passes/hccl_memcpy_pass.h | 3 + src/ge/graph/passes/infershape_pass.cc | 3 +- .../graph/passes/link_gen_mask_nodes_pass.cc | 9 +- .../graph/passes/link_gen_mask_nodes_pass.h | 6 +- src/ge/graph/passes/net_output_pass.h | 3 +- src/ge/graph/passes/next_iteration_pass.h | 1 - .../graph/passes/no_reshape_op_remove_pass.cc | 202 -- .../graph/passes/no_reshape_op_remove_pass.h | 68 - .../passes/no_use_reshape_remove_pass.cc | 14 +- src/ge/graph/passes/pass_manager.cc | 12 +- src/ge/graph/passes/pass_utils.h | 3 +- src/ge/graph/passes/permute_pass.cc | 145 +- .../passes/placeholder_with_default_pass.cc | 2 - src/ge/graph/passes/prevent_gradient_pass.cc | 1 - src/ge/graph/passes/print_op_pass.cc | 1 - src/ge/graph/passes/prune_pass.cc | 3 +- .../passes/replace_with_empty_const_pass.cc | 156 ++ .../passes/replace_with_empty_const_pass.h | 34 + src/ge/graph/passes/reshape_remove_pass.cc | 1 - .../passes/resource_pair_add_control_pass.cc | 1 - .../resource_pair_remove_control_pass.cc | 1 - .../same_transdata_breadth_fusion_pass.cc | 12 +- .../same_transdata_breadth_fusion_pass.h | 1 - src/ge/graph/passes/save_pass.cc | 4 +- src/ge/graph/passes/snapshot_pass.cc | 4 +- src/ge/graph/passes/stop_gradient_pass.cc | 3 +- .../graph/passes/switch_logic_remove_pass.cc | 33 +- .../graph/passes/switch_logic_remove_pass.h | 2 +- src/ge/graph/passes/switch_op_pass.cc | 2 - src/ge/graph/passes/switch_op_pass.h | 5 +- src/ge/graph/passes/switch_pass.cc | 5 +- .../passes/transop_breadth_fusion_pass.cc | 53 +- .../graph/passes/transop_depth_fusion_pass.cc | 36 +- .../graph/passes/transop_depth_fusion_pass.h | 4 +- .../transop_nearby_allreduce_fusion_pass.cc | 43 +- .../transop_without_reshape_fusion_pass.cc | 8 +- .../transop_without_reshape_fusion_pass.h | 3 +- src/ge/graph/passes/unused_const_pass.cc | 2 - src/ge/graph/passes/unused_op_remove_pass.cc | 3 +- src/ge/graph/passes/unused_op_remove_pass.h | 1 - src/ge/graph/passes/update_net_output_pass.cc | 169 -- .../graph/passes/var_is_initialized_op_pass.h | 2 - src/ge/graph/passes/variable_format_pass.cc | 2 - src/ge/graph/passes/variable_format_pass.h | 1 - src/ge/graph/passes/variable_op_pass.cc | 45 +- src/ge/graph/passes/variable_op_pass.h | 4 +- .../graph/passes/variable_prepare_op_pass.cc | 85 +- .../graph/passes/variable_prepare_op_pass.h | 1 + .../passes/variable_ref_delete_op_pass.cc | 8 +- .../passes/variable_ref_delete_op_pass.h | 4 +- src/ge/graph/preprocess/graph_preprocess.cc | 508 +++-- src/ge/graph/preprocess/graph_preprocess.h | 7 +- .../preprocess/insert_op/base_insert_op.cc | 222 -- .../preprocess/insert_op/base_insert_op.h | 22 +- .../graph/preprocess/insert_op/ge_aipp_op.cc | 479 ++++- .../graph/preprocess/insert_op/ge_aipp_op.h | 12 +- .../insert_op/util_insert_aipp_op.cc | 343 ++- .../insert_op/util_insert_aipp_op.h | 13 +- .../preprocess/multi_batch_copy_graph.cc | 207 +- .../graph/preprocess/multi_batch_copy_graph.h | 27 +- src/ge/inc/node_pass.h | 66 + src/ge/init/gelib.cc | 166 +- src/ge/init/gelib.h | 14 +- src/ge/ir_build/ge_ir_build.cc | 292 +++ src/ge/model/ge_model.cc | 1 + src/ge/omm/csa_interact.cc | 3 - .../opskernel_manager/ops_kernel_manager.cc | 2 +- src/ge/opskernel_manager/ops_kernel_manager.h | 4 +- src/ge/session/session_manager.cc | 60 +- src/ge/session/session_manager.h | 2 +- src/ge/single_op/single_op.cc | 9 - src/ge/single_op/single_op_model.cc | 18 +- src/ge/single_op/stream_resource.cc | 5 +- src/ge/single_op/task/tbe_task_builder.cc | 6 +- 
src/proto/op_mapping_info.proto | 2 + tests/depends/cce/CMakeLists.txt | 1 + tests/depends/cce/src/op_kernel_registry.cc | 29 + tests/depends/mmpa/src/mmpa_stub.cc | 9 + tests/depends/runtime/src/runtime_stub.cc | 26 +- tests/depends/slog/src/slog_stub.cc | 5 + tests/st/resnet50/common.cc | 289 +-- tests/st/resnet50/resnet50_train.cc | 16 +- tests/ut/common/graph/CMakeLists.txt | 1 + .../ge_graph/ge_model_serialize_unittest.cc | 2 +- .../testcase/ge_graph/ge_node_unittest.cc | 4 +- .../testcase/ge_graph/ge_tensor_unittest.cc | 18 +- tests/ut/ge/CMakeLists.txt | 9 +- tests/ut/ge/graph/ge_executor_unittest.cc | 4 +- tests/ut/ge/graph/graph_load_unittest.cc | 4 +- .../ge/graph/load/end_graph_task_unittest.cc | 5 - ...ew_model_manager_davinci_model_unittest.cc | 6 +- .../graph/load/output_net_output_unittest.cc | 10 +- .../ut/ge/graph/passes/addn_pass_unittest.cc | 2 +- .../dynamic_stitch_kernel_unittest.cc | 24 +- .../folding_kernel/mul_kernel_unittest.cc | 7 +- .../no_reshape_op_remove_pass_unittest.cc | 204 -- ...p_nearby_allreduce_fusion_pass_unittest.cc | 2 +- .../passes/update_net_output_pass_unittest.cc | 95 - .../ge_profiling_manager_unittest.cc | 13 +- .../inc/aicpu/common/aicpu_task_struct.h | 5 +- third_party/fwkacllib/inc/cce/cce_def.hpp | 0 .../fwkacllib/inc/cce/common/attr_list.hpp | 0 .../fwkacllib/inc/cce/common/catch.hpp | 0 .../fwkacllib/inc/cce/dnn_base_def.hpp | 0 third_party/fwkacllib/inc/cce/dnn_op.h | 2 +- third_party/fwkacllib/inc/cce/dnn_struct.hpp | 0 .../fwkacllib/inc/cce/dnn_struct_base.hpp | 0 .../fwkacllib/inc/cce/l2fusion_struct.hpp | 0 third_party/fwkacllib/inc/cce/taskdown_api.h | 8 + .../fwkacllib/inc/cce/taskdown_common.hpp | 0 third_party/fwkacllib/inc/hccl/base.h | 3 + third_party/fwkacllib/inc/mmpa/mmpa_api.h | 4 +- third_party/fwkacllib/inc/ops/aipp.h | 30 + third_party/fwkacllib/inc/ops/all_ops.h | 34 +- third_party/fwkacllib/inc/ops/array_ops.h | 419 ++-- third_party/fwkacllib/inc/ops/audio_ops.h | 115 +- third_party/fwkacllib/inc/ops/batch_ops.h | 125 +- third_party/fwkacllib/inc/ops/bitwise_ops.h | 27 +- .../fwkacllib/inc/ops/boosted_trees_ops.h | 31 +- .../inc/ops/candidate_sampling_ops.h | 418 ++-- third_party/fwkacllib/inc/ops/clip_boxes.h | 37 + .../fwkacllib/inc/ops/control_flow_ops.h | 206 +- third_party/fwkacllib/inc/ops/ctc_ops.h | 66 + third_party/fwkacllib/inc/ops/data_flow_ops.h | 1445 ++++++++++++- third_party/fwkacllib/inc/ops/decode_bbox.h | 33 + .../inc/ops/decode_boundaries_target.h | 31 + .../inc/ops/decode_cornerpoints_target_bg.h | 31 + ...decode_cornerpoints_target_wrt_center_v1.h | 32 + .../fwkacllib/inc/ops/decode_wheels_target.h | 31 + third_party/fwkacllib/inc/ops/dvpp_ops.h | 62 - .../inc/ops/elewise_calculation_ops.h | 405 ++-- .../fwkacllib/inc/ops/fastrcnn_predictions.h | 36 + .../inc/ops/fsrdetectionoutput_ops.h | 67 + third_party/fwkacllib/inc/ops/image_ops.h | 925 +++++++- third_party/fwkacllib/inc/ops/linalg_ops.h | 306 +++ third_party/fwkacllib/inc/ops/logging_ops.h | 47 +- third_party/fwkacllib/inc/ops/lookup_ops.h | 213 +- third_party/fwkacllib/inc/ops/math_ops.h | 238 ++ .../inc/ops/matrix_calculation_ops.h | 402 +++- .../fwkacllib/inc/ops/nn_batch_norm_ops.h | 228 +- .../fwkacllib/inc/ops/nn_calculation_ops.h | 464 ++-- third_party/fwkacllib/inc/ops/nn_detect_ops.h | 141 +- third_party/fwkacllib/inc/ops/nn_norm_ops.h | 264 ++- third_party/fwkacllib/inc/ops/nn_ops.h | 131 ++ .../fwkacllib/inc/ops/nn_pooling_ops.h | 189 +- .../fwkacllib/inc/ops/nn_training_ops.h | 570 ++++- .../fwkacllib/inc/ops/nonlinear_fuc_ops.h | 
238 +- .../fwkacllib/inc/ops/npu_loss_scale_ops.h | 1 - third_party/fwkacllib/inc/ops/outfeed_ops.h | 16 + third_party/fwkacllib/inc/ops/pad_ops.h | 2 +- third_party/fwkacllib/inc/ops/parsing_ops.h | 17 + third_party/fwkacllib/inc/ops/power_ops.h | 49 + third_party/fwkacllib/inc/ops/quantize_ops.h | 42 +- .../fwkacllib/inc/ops/ragged_array_ops.h | 61 + .../fwkacllib/inc/ops/ragged_conversion_ops.h | 54 + .../fwkacllib/inc/ops/ragged_math_ops.h | 54 + third_party/fwkacllib/inc/ops/random_ops.h | 291 ++- third_party/fwkacllib/inc/ops/reduce_ops.h | 345 ++- .../fwkacllib/inc/ops/resource_variable_ops.h | 56 + third_party/fwkacllib/inc/ops/rnn.h | 80 + .../fwkacllib/inc/ops/roipooling_ops.h | 78 + third_party/fwkacllib/inc/ops/rpn_proposals.h | 54 + third_party/fwkacllib/inc/ops/sdca_ops.h | 86 + third_party/fwkacllib/inc/ops/selection_ops.h | 600 +++++- third_party/fwkacllib/inc/ops/set_ops.h | 91 +- third_party/fwkacllib/inc/ops/sparse_ops.h | 729 ++++++- .../fwkacllib/inc/ops/split_combination_ops.h | 235 +- .../inc/ops/ssddetectionoutput_ops.h | 65 + third_party/fwkacllib/inc/ops/state_ops.h | 32 +- .../fwkacllib/inc/ops/stateful_random_ops.h | 216 ++ .../fwkacllib/inc/ops/stateless_random_ops.h | 41 + third_party/fwkacllib/inc/ops/string_ops.h | 340 +++ third_party/fwkacllib/inc/ops/swap_co_ops.h | 56 + .../fwkacllib/inc/ops/transformation_ops.h | 210 +- .../inc/register/op_kernel_registry.h | 48 + .../fwkacllib/inc/register/op_registry.h | 20 +- third_party/fwkacllib/inc/runtime/base.h | 69 +- third_party/fwkacllib/inc/runtime/config.h | 4 +- third_party/fwkacllib/inc/runtime/context.h | 15 +- third_party/fwkacllib/inc/runtime/dev.h | 19 +- .../fwkacllib/inc/runtime/dvfsprofile.h | 2 +- third_party/fwkacllib/inc/runtime/event.h | 2 +- third_party/fwkacllib/inc/runtime/kernel.h | 129 +- third_party/fwkacllib/inc/runtime/mem.h | 34 +- third_party/fwkacllib/inc/runtime/rt_model.h | 62 +- third_party/fwkacllib/inc/runtime/stream.h | 2 +- third_party/fwkacllib/inc/tdt/tsd_client.h | 42 + third_party/fwkacllib/inc/toolchain/slog.h | 47 +- third_party/fwkacllib/version.info | 2 +- third_party/prebuild/x86_64/libslog.so | Bin 89288 -> 89440 bytes 500 files changed, 30116 insertions(+), 8175 deletions(-) create mode 100644 inc/external/ge/ge_ir_build.h create mode 100644 src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc create mode 100644 src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.h create mode 100644 src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc create mode 100644 src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.h create mode 100644 src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc create mode 100644 src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.h create mode 100644 src/ge/common/math/fp16_math.cc create mode 100644 src/ge/common/math/fp16_math.h create mode 100644 src/ge/ge_local_engine/engine/host_cpu_engine.cc create mode 100644 src/ge/ge_local_engine/engine/host_cpu_engine.h rename src/ge/graph/build/{graph_build.cc => graph_builder.cc} (80%) rename src/ge/graph/build/{graph_build.h => graph_builder.h} (97%) create mode 100644 src/ge/graph/build/label_allocator.cc create mode 100644 src/ge/graph/build/label_allocator.h rename src/ge/graph/build/{optimize_stream_graph.cc => stream_graph_optimizer.cc} (80%) rename src/ge/graph/build/{optimize_stream_graph.h => stream_graph_optimizer.h} (85%) create mode 100644 
src/ge/graph/label/case_label_maker.cc create mode 100644 src/ge/graph/label/case_label_maker.h create mode 100644 src/ge/graph/label/if_label_maker.cc create mode 100644 src/ge/graph/label/if_label_maker.h create mode 100644 src/ge/graph/label/label_maker.cc create mode 100644 src/ge/graph/label/label_maker.h create mode 100644 src/ge/graph/label/label_maker_factory.h create mode 100644 src/ge/graph/label/partitioned_call_label_maker.cc create mode 100644 src/ge/graph/label/partitioned_call_label_maker.h create mode 100644 src/ge/graph/label/while_label_maker.cc create mode 100644 src/ge/graph/label/while_label_maker.h create mode 100644 src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc create mode 100644 src/ge/graph/load/new_model_manager/cpu_queue_schedule.h create mode 100644 src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.cc create mode 100644 src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.h create mode 100644 src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc create mode 100644 src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h create mode 100644 src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc create mode 100644 src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h delete mode 100644 src/ge/graph/optimize/optimizer/graph_pass.h create mode 100644 src/ge/graph/passes/cast_remove_pass.cc rename src/ge/graph/passes/{update_net_output_pass.h => cast_remove_pass.h} (55%) create mode 100644 src/ge/graph/passes/common_subexpression_elimination_pass.cc rename third_party/fwkacllib/inc/ops/aipp_data.h => src/ge/graph/passes/common_subexpression_elimination_pass.h (65%) create mode 100644 src/ge/graph/passes/folding_kernel/slice_d_kernel.cc create mode 100644 src/ge/graph/passes/folding_kernel/slice_d_kernel.h create mode 100644 src/ge/graph/passes/folding_kernel/unpack_kernel.cc rename src/ge/graph/{optimize/optimizer/pass.h => passes/folding_kernel/unpack_kernel.h} (60%) delete mode 100644 src/ge/graph/passes/no_reshape_op_remove_pass.cc delete mode 100644 src/ge/graph/passes/no_reshape_op_remove_pass.h create mode 100644 src/ge/graph/passes/replace_with_empty_const_pass.cc create mode 100644 src/ge/graph/passes/replace_with_empty_const_pass.h delete mode 100644 src/ge/graph/passes/update_net_output_pass.cc delete mode 100644 src/ge/graph/preprocess/insert_op/base_insert_op.cc create mode 100644 src/ge/inc/node_pass.h create mode 100644 src/ge/ir_build/ge_ir_build.cc create mode 100644 tests/depends/cce/src/op_kernel_registry.cc delete mode 100644 tests/ut/ge/graph/passes/no_reshape_op_remove_pass_unittest.cc delete mode 100644 tests/ut/ge/graph/passes/update_net_output_pass_unittest.cc mode change 100755 => 100644 third_party/fwkacllib/inc/cce/cce_def.hpp mode change 100755 => 100644 third_party/fwkacllib/inc/cce/common/attr_list.hpp mode change 100755 => 100644 third_party/fwkacllib/inc/cce/common/catch.hpp mode change 100755 => 100644 third_party/fwkacllib/inc/cce/dnn_base_def.hpp mode change 100755 => 100644 third_party/fwkacllib/inc/cce/dnn_struct.hpp mode change 100755 => 100644 third_party/fwkacllib/inc/cce/dnn_struct_base.hpp mode change 100755 => 100644 third_party/fwkacllib/inc/cce/l2fusion_struct.hpp mode change 100755 => 100644 third_party/fwkacllib/inc/cce/taskdown_common.hpp create mode 100644 third_party/fwkacllib/inc/ops/clip_boxes.h create mode 100644 third_party/fwkacllib/inc/ops/ctc_ops.h create mode 100644 
third_party/fwkacllib/inc/ops/decode_bbox.h create mode 100644 third_party/fwkacllib/inc/ops/decode_boundaries_target.h create mode 100644 third_party/fwkacllib/inc/ops/decode_cornerpoints_target_bg.h create mode 100644 third_party/fwkacllib/inc/ops/decode_cornerpoints_target_wrt_center_v1.h create mode 100644 third_party/fwkacllib/inc/ops/decode_wheels_target.h delete mode 100644 third_party/fwkacllib/inc/ops/dvpp_ops.h create mode 100644 third_party/fwkacllib/inc/ops/fastrcnn_predictions.h create mode 100644 third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h create mode 100644 third_party/fwkacllib/inc/ops/power_ops.h create mode 100644 third_party/fwkacllib/inc/ops/ragged_array_ops.h create mode 100644 third_party/fwkacllib/inc/ops/ragged_conversion_ops.h create mode 100644 third_party/fwkacllib/inc/ops/ragged_math_ops.h create mode 100644 third_party/fwkacllib/inc/ops/resource_variable_ops.h create mode 100644 third_party/fwkacllib/inc/ops/roipooling_ops.h create mode 100644 third_party/fwkacllib/inc/ops/rpn_proposals.h create mode 100644 third_party/fwkacllib/inc/ops/sdca_ops.h create mode 100644 third_party/fwkacllib/inc/ops/ssddetectionoutput_ops.h create mode 100644 third_party/fwkacllib/inc/ops/stateful_random_ops.h create mode 100644 third_party/fwkacllib/inc/ops/swap_co_ops.h create mode 100644 third_party/fwkacllib/inc/register/op_kernel_registry.h diff --git a/inc/common/blocking_queue.h b/inc/common/blocking_queue.h index d91abd27..7a5e98cf 100644 --- a/inc/common/blocking_queue.h +++ b/inc/common/blocking_queue.h @@ -18,7 +18,6 @@ #define INC_COMMON_BLOCKING_QUEUE_H_ #include - #include #include #include @@ -87,7 +86,7 @@ class BlockingQueue { is_stoped_ = false; } - // if the queue stop , the function to release the unprocessed items will be call + // if the queue is stopped, call this function to release the unprocessed items std::list GetRemainItems() { std::unique_lock lock(mutex_); diff --git a/inc/common/dynamic_aipp.h b/inc/common/dynamic_aipp.h index 4873dbec..a687853f 100644 --- a/inc/common/dynamic_aipp.h +++ b/inc/common/dynamic_aipp.h @@ -19,10 +19,10 @@ #include -/// -/// @ingroup dnn -/// @brief struct define of dynamic aipp batch parameter. -/// +/** + * @ingroup dnn + * @brief struct definition of dynamic aipp batch parameter. + */ typedef struct tagAippDynamicBatchPara { int8_t cropSwitch; // crop switch int8_t scfSwitch; // resize switch @@ -66,10 +66,10 @@ typedef struct tagAippDynamicBatchPara { int8_t reserve1[16]; // 32B assign, for ub copy } kAippDynamicBatchPara; -/// -/// @ingroup dnn -/// @brief struct definition of dynamic aipp parameter. lite:64+96*batchNum byte ; tiny:64+64*batchNum byte -/// +/** + * @ingroup dnn + * @brief struct definition of dynamic aipp parameter.
lite:64+96*batchNum byte ; tiny:64+64*batchNum byte + */ typedef struct tagAippDynamicPara { uint8_t inputFormat; // input format:YUV420SP_U8/XRGB8888_U8/RGB888_U8 int8_t cscSwitch; // csc switch diff --git a/inc/common/npu_error_define.h b/inc/common/npu_error_define.h index 249ea673..a4515cf6 100644 --- a/inc/common/npu_error_define.h +++ b/inc/common/npu_error_define.h @@ -61,19 +61,19 @@ typedef enum tagHiAiNpuModuleId { HIAI_DP = 23, } HiAiNpuModuleId; -// bit 31-bit30 to be hiai local +/* bit 31-bit30 to be hiai local */ #define HIAI_NPULOCAL_MASK 0xC0000000 #define SHIFT_LOCAL_MASK 30 #define HIAI_NPULOCAL_VAL_MASK 0x3 -// bit 29 -bit28 to be hiai aicpu code type +/* bit 29 -bit28 to be hiai aicpu code type */ #define HIAI_CODE_TYPE_MASK 0x30000000 #define SHIFT_CODE_MASK 28 #define HIAI_CODE_TYPE_VAL_MASK 0x3 -// bit 27 -bit25 to be hiai error level +/* bit 27 -bit25 to be hiai error level */ #define HIAI_ERROR_LEVEL_MASK 0x0E000000 #define SHIFT_ERROR_LVL_MASK 25 #define HIAI_ERROR_LEVEL_VAL_MASK 0x7 -// bit 24 -bit17 to be hiai mod +/* bit 24 -bit17 to be hiai mod */ #define HIAI_MODE_ID_MASK 0x01FE0000 #define SHIFT_MODE_MASK 17 #define HIAI_MODE_ID_VAL_MASK 0xFF diff --git a/inc/common/opskernel/ge_task_info.h b/inc/common/opskernel/ge_task_info.h index 74eee458..1b8c7584 100644 --- a/inc/common/opskernel/ge_task_info.h +++ b/inc/common/opskernel/ge_task_info.h @@ -19,13 +19,12 @@ #include #include - #include #include using std::string; namespace ge { -// DAVINCI_TRAIN/DAVINCI_CLOUD is not needed when GETaskKernelHcclInfo needed +// DAVINCI_TRAIN/DAVINCI_CLOUD will not be needed once GETaskKernelHcclInfo is eliminated struct GETaskKernelHcclInfo { string hccl_type; void *inputDataAddr; diff --git a/inc/common/opskernel/ops_kernel_info_store.h b/inc/common/opskernel/ops_kernel_info_store.h index df159998..52ceda91 100644 --- a/inc/common/opskernel/ops_kernel_info_store.h +++ b/inc/common/opskernel/ops_kernel_info_store.h @@ -21,7 +21,6 @@ #include #include #include - #include "./ge_task_info.h" #include "./ops_kernel_info_types.h" #include "cce/aicpu_engine_struct.h" @@ -29,7 +28,6 @@ #include "common/ge_inner_error_codes.h" #include "graph/node.h" #include "proto/task.pb.h" - using std::map; using std::string; using std::to_string; @@ -47,7 +45,7 @@ class OpsKernelInfoStore { // initialize opsKernelInfoStore virtual Status Initialize(const map &options) = 0; - // finalize opsKernelInfoStore + // close opsKernelInfoStore virtual Status Finalize() = 0; virtual Status CreateSession(const std::map &session_options) { return SUCCESS; } @@ -57,18 +55,20 @@ class OpsKernelInfoStore { // get all opsKernelInfo virtual void GetAllOpsKernelInfo(map &infos) const = 0; - // check whether opsKernelInfoStore is supported based on the operator attribute + // check whether the opsKernelInfoStore supports the op, based on the operator attribute virtual bool CheckSupported(const OpDescPtr &opDescPtr, std::string &un_supported_reason) const = 0; virtual bool CheckAccuracySupported(const OpDescPtr &opDescPtr, std::string &un_supported_reason, bool realQuery = false) const { return CheckSupported(opDescPtr, un_supported_reason); } + // opsFlag: opsFlag[0] indicates whether constant folding is supported + virtual void opsFlagCheck(const ge::Node &node, std::string &opsFlag){}; - // requirement of memory allocation + // memory allocation requirement virtual Status CalcOpRunningParam(Node &node) = 0; - // generate task for op + // generate task for the op virtual Status GenerateTask(const Node &node, RunContext
&context, std::vector &tasks) = 0; // only call fe engine interface to compile single op @@ -77,10 +77,10 @@ class OpsKernelInfoStore { // load task for op virtual Status LoadTask(GETaskInfo &task) { return SUCCESS; } - // only to call aicpu interface for generating task struct + // only call aicpu interface to generate task struct virtual Status GenSingleOpRunTask(const NodePtr &node, STR_FWK_OP_KERNEL &task, string &task_info) { return SUCCESS; } - // only to call aicpu interface for generating task struct + // only call aicpu interface to generate task struct virtual Status GenMemCopyTask(uint64_t count, STR_FWK_OP_KERNEL &task, string &task_info) { return SUCCESS; } }; } // namespace ge diff --git a/inc/common/opskernel/ops_kernel_info_types.h b/inc/common/opskernel/ops_kernel_info_types.h index 7ebf463d..d13840bd 100644 --- a/inc/common/opskernel/ops_kernel_info_types.h +++ b/inc/common/opskernel/ops_kernel_info_types.h @@ -37,6 +37,7 @@ struct RunContext { ge::Buffer weightsBuffer; std::vector graphStreamList; // all streams of graph, order by ge stream id(0,1,...) std::vector graphEventList; // all events of graph, order by ge event id(0,1,...) + std::vector graphLabelList; // all labels of graph, order by ge label id(0,1,...) }; struct Task { diff --git a/inc/common/optimizer/graph_optimizer.h b/inc/common/optimizer/graph_optimizer.h index cc972425..bce3cb18 100644 --- a/inc/common/optimizer/graph_optimizer.h +++ b/inc/common/optimizer/graph_optimizer.h @@ -19,7 +19,6 @@ #include #include - #include "./graph_optimizer_types.h" #include "common/ge_inner_error_codes.h" #include "common/opskernel/ops_kernel_info_types.h" @@ -39,19 +38,19 @@ class GraphOptimizer { // close graphOptimizer virtual Status Finalize() = 0; - // optimize original graph for FE quant optimization + // optimize the original graph for FE quant optimization virtual Status OptimizeGraphPrepare(ComputeGraph &graph) { return SUCCESS; } - // optimize original graph used in the graph preparation stage + // optimize the original graph, used in the graph preparation stage virtual Status OptimizeOriginalGraph(ComputeGraph &graph) = 0; // optimize fused graph virtual Status OptimizeFusedGraph(ComputeGraph &graph) = 0; - // optimize the whole graph which will be used after graph merged + // optimize the whole graph, used after the graph merge stage virtual Status OptimizeWholeGraph(ComputeGraph &graph) = 0; - // get attributes of graph optimizer + // get the attributes of the graph optimizer virtual Status GetAttributes(GraphOptimizerAttribute &attrs) const = 0; // optimize streamed Graph diff --git a/inc/common/optimizer/graph_optimizer_types.h b/inc/common/optimizer/graph_optimizer_types.h index 5c760c0b..9e1ec96b 100644 --- a/inc/common/optimizer/graph_optimizer_types.h +++ b/inc/common/optimizer/graph_optimizer_types.h @@ -19,8 +19,6 @@ #include #include - -using std::string; namespace ge { enum OPTIMIZER_SCOPE { UNIT = 0, @@ -28,7 +26,7 @@ enum OPTIMIZER_SCOPE { }; struct GraphOptimizerAttribute { - string engineName; + std::string engineName; OPTIMIZER_SCOPE scope; }; } // namespace ge diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h index a47c5889..bf9a10b4 100644 --- a/inc/external/ge/ge_api_types.h +++ b/inc/external/ge/ge_api_types.h @@ -20,6 +20,7 @@ #include #include #include +#include namespace ge { // Option key: graph run mode @@ -38,9 +39,11 @@ const char *const GE_AICPU_FLAG = "ge.aicpuFlag"; const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath"; const char *const OPTION_EXEC_ENABLE_DUMP =
"ge.exec.enableDump"; const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath"; +const char *const OPTION_EXEC_DUMP_STEP = "ge.exec.dumpStep"; // Hccl flag, if ge.exec.hcclFlag =1, it means load plugin for opskernel, else:ge.exec.hcclFlag =0 const char *const OPTION_EXEC_HCCL_FLAG = "ge.exec.hcclFlag"; const char *const OPTION_EXEC_ATOMIC_FLAG = "ge.exec.enable_atomic"; +const char *const OPTION_EXEC_DISABLE_REUSED_MEMORY = "ge.exec.disableReuseMemory"; // Option key: memory init const char *const GRAPH_MEMORY_MAX_SIZE = "ge.graphMemoryMaxSize"; @@ -141,19 +144,43 @@ const std::string STREAM_MAX_PARALLEL_NUM = "ge.streamMaxParallelNum"; // congigure outputDatatype to setting net output type const std::string OUTPUT_DATATYPE = "ge.outputDatatype"; +// configure opSelectImplmode to set the op select implmode +const std::string kOpSelectImplmode = "ge.opSelectImplmode"; + // configure whether to enable hcom parallel by session constructor options param, // its value should be "0" or "1", default value is "0" const std::string HCOM_PARALLEL = "ge.hcomParallel"; +// configure whether to use dynamic batch size +const char *const kDynamicBatchSize = "ge.dynamicBatchSize"; + +// configure whether to use dynamic image size +const char *const kDynamicImageSize = "ge.dynamicImageSize"; + // Configure auto tune mode, this option only take effect while AUTO_TUNE_FLAG is Y, // example: GA|RL, support configure multiple, split by | const std::string AUTO_TUNE_MODE = "ge.autoTuneMode"; +// Configure soc version , example: "Ascend310" +const std::string SOC_VERSION = "ge.socVersion"; + // Configure core type "VectorEngine", default value is "AIcoreEngine" const std::string CORE_TYPE = "ge.engineType"; -// Configure soc version , example: "Ascend310" -const std::string SOC_VERSION = "ge.socVersion"; +// Configure AICORE NUM +const std::string AICORE_NUM = "ge.aicoreNum"; + +// Configure L1FUSION +const std::string L1_FUSION = "ge.l1Fusion"; + +// Configure Small Channel flag +const std::string ENABLE_SMALL_CHANNEL = "ge.enableSmallChannel"; + +// Configure Compress Weight flag +const std::string ENABLE_COMPRESS_WEIGHT = "ge.enableCompressWeight"; + +// Configure fusion switch file path +const std::string FUSION_SWITCH_FILE = "ge.fusionSwitchFile"; // Save original model const std::string SAVE_ORIGINAL_MODEL = "ge.saveOriginalModel"; @@ -194,6 +221,28 @@ struct TensorInfo { DataDesc data; // tensor data ShapeDesc shapeInfo; // tensor shape }; +// for ir build +namespace ir_option { +static const char *const INPUT_FORMAT = "input_format"; +static const char *const INPUT_SHAPE = "input_shape"; +static const char *const OP_NAME_MAP = "op_name_map"; +static const char *const DYNAMIC_BATCH_SIZE = kDynamicBatchSize; +static const char *const DYNAMIC_IMAGE_SIZE = kDynamicImageSize; +static const char *const INSERT_OP_FILE = ge::INSERT_OP_FILE.c_str(); +static const char *const PRECISION_MODE = ge::PRECISION_MODE.c_str(); +static const char *const EXEC_DISABLE_REUSED_MEMORY = ge::OPTION_EXEC_DISABLE_REUSED_MEMORY; +static const char *const HEAD_STREAM = ge::HEAD_STREAM.c_str(); +static const char *const AUTO_TUNE_MODE = ge::AUTO_TUNE_MODE.c_str(); +static const char *const CORE_TYPE = ge::CORE_TYPE.c_str(); +static const char *const SOC_VERSION = ge::SOC_VERSION.c_str(); +// for interface: aclgrphBuildModel +const std::set ir_builder_suppported_options = { + INPUT_FORMAT, INPUT_SHAPE, OP_NAME_MAP, DYNAMIC_BATCH_SIZE, + DYNAMIC_IMAGE_SIZE, INSERT_OP_FILE, PRECISION_MODE, EXEC_DISABLE_REUSED_MEMORY, +
AUTO_TUNE_MODE}; +// for interface: aclgrphBuildInitialize +const std::set global_options = {HEAD_STREAM, CORE_TYPE, SOC_VERSION}; +} // namespace ir_option } // namespace ge #endif // INC_EXTERNAL_GE_GE_API_TYPES_H_ diff --git a/inc/external/ge/ge_ir_build.h b/inc/external/ge/ge_ir_build.h new file mode 100644 index 00000000..5982ae90 --- /dev/null +++ b/inc/external/ge/ge_ir_build.h @@ -0,0 +1,75 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INC_EXTERNAL_GE_IR_BUILD_H_ +#define INC_EXTERNAL_GE_IR_BUILD_H_ + +#include +#include +#include +#include "graph/graph.h" +#include "graph/ge_error_codes.h" + +namespace ge { + +struct ModelBufferData { + std::shared_ptr data = nullptr; + uint64_t length; +}; + +/** + * @ingroup AscendCL + * @brief initialize the environment for model building + * + * @param global_options[IN] global init params for build + * @retval GRAPH_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +graphStatus aclgrphBuildInitialize(std::map global_options); + +/** + * @ingroup AscendCL + * @brief release the environment for model building + * + */ +void aclgrphBuildFinalize(); + +/** + * @ingroup AscendCL + * @brief build the model. Note that the model is stored in a buffer + * + * @param graph[IN] the graph ready to build + * @param build_options[IN] options used for build + * @param model[OUT] built model + * @retval GRAPH_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +graphStatus aclgrphBuildModel(const ge::Graph &graph, const std::map &build_options, + ModelBufferData &model); + +/** + * @ingroup AscendCL + * @brief save model buffer to file + * + * @param output_file[IN] the file path to be saved + * @param model[IN] model buffer data + * @retval GRAPH_SUCCESS The function is successfully executed.
+ * @retval OtherValues Failure + */ +graphStatus aclgrphSaveModel(const string &output_file, const ModelBufferData &model); + +}; // namespace ge +#endif diff --git a/inc/external/graph/attr_value.h b/inc/external/graph/attr_value.h index f9635e6a..32fce04c 100644 --- a/inc/external/graph/attr_value.h +++ b/inc/external/graph/attr_value.h @@ -22,7 +22,7 @@ #include #include -#include "external/graph/ge_error_codes.h" +#include "./ge_error_codes.h" using std::make_shared; using std::map; diff --git a/inc/external/graph/graph.h b/inc/external/graph/graph.h index 6e074239..b4ebb435 100644 --- a/inc/external/graph/graph.h +++ b/inc/external/graph/graph.h @@ -22,7 +22,7 @@ #include #include -#include "external/graph/operator.h" +#include "./operator.h" namespace ge { class GraphImpl; diff --git a/inc/external/graph/inference_context.h b/inc/external/graph/inference_context.h index eb8fae3d..69079142 100644 --- a/inc/external/graph/inference_context.h +++ b/inc/external/graph/inference_context.h @@ -21,8 +21,8 @@ #include #include -#include "external/graph/tensor.h" -#include "external/graph/types.h" +#include "./tensor.h" +#include "./types.h" namespace ge { class InferenceContext; @@ -69,7 +69,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY InferenceContext { static std::unique_ptr Create(); private: - InferenceContext(std::unique_ptr &impl); + explicit InferenceContext(std::unique_ptr &impl); std::shared_ptr inference_context_impl_; }; } // namespace ge diff --git a/inc/external/graph/operator.h b/inc/external/graph/operator.h index 0ffa7313..ed2e639a 100644 --- a/inc/external/graph/operator.h +++ b/inc/external/graph/operator.h @@ -23,9 +23,9 @@ #include #include -#include "external/graph/ge_error_codes.h" -#include "external/graph/inference_context.h" -#include "external/graph/tensor.h" +#include "./ge_error_codes.h" +#include "./inference_context.h" +#include "./tensor.h" #ifndef USER_GE_LOGI #define USER_GE_LOGI(...) diff --git a/inc/external/graph/operator_factory.h b/inc/external/graph/operator_factory.h index 0078b904..f9ec7669 100644 --- a/inc/external/graph/operator_factory.h +++ b/inc/external/graph/operator_factory.h @@ -22,8 +22,8 @@ #include #include -#include "external/graph//operator.h" -#include "external/graph/ge_error_codes.h" +#include "./operator.h" +#include "./ge_error_codes.h" namespace ge { using OpCreator = std::function; diff --git a/inc/external/graph/operator_reg.h b/inc/external/graph/operator_reg.h index 85f8db03..2878b4eb 100644 --- a/inc/external/graph/operator_reg.h +++ b/inc/external/graph/operator_reg.h @@ -22,10 +22,10 @@ #include #include -#include "external/graph/operator.h" -#include "external/graph/operator_factory.h" -#include "external/graph/tensor.h" -#include "external/graph/types.h" +#include "./operator.h" +#include "./operator_factory.h" +#include "./tensor.h" +#include "./types.h" namespace ge { using std::function; @@ -60,7 +60,7 @@ class OpReg { \ private: \ void __##x() { \ - OpReg() + OpReg() #define ATTR(x, Type, ...) 
\ N(); \ @@ -86,7 +86,7 @@ class OpReg { void __attr_##x() { \ Operator::AttrRegister(#x, Op##Type(__VA_ARGS__)); \ string attr_name(#x); \ - (void)OpReg() + (void)OpReg() #define REQUIRED_ATTR(x, Type) \ N(); \ @@ -112,7 +112,7 @@ class OpReg { void __required_attr_##x() { \ Operator::RequiredAttrRegister(#x); \ string attr_name(#x); \ - (void)OpReg() + (void)OpReg() #define INPUT(x, t) \ N(); \ @@ -137,7 +137,7 @@ class OpReg { private: \ void __input_##x() { \ Operator::InputRegister(#x); \ - (void)OpReg() + (void)OpReg() #define OPTIONAL_INPUT(x, t) \ N(); \ @@ -162,7 +162,7 @@ class OpReg { private: \ void __optional_input_##x() { \ Operator::OptionalInputRegister(#x); \ - (void)OpReg() + (void)OpReg() #define OUTPUT(x, t) \ N(); \ @@ -179,7 +179,7 @@ class OpReg { private: \ void __out_##x() { \ Operator::OutputRegister(#x); \ - (void)OpReg() + (void)OpReg() #define DYNAMIC_INPUT(x, t) \ N(); \ @@ -206,7 +206,7 @@ class OpReg { \ private: \ void __dy_input_##x() { \ - (void)OpReg() + (void)OpReg() #define DYNAMIC_OUTPUT(x, t) \ N(); \ @@ -227,18 +227,18 @@ class OpReg { \ private: \ void __dy_output_##x() { \ - (void)OpReg() + (void)OpReg() #define PASTE(g_register, y) g_register##y -#define __OP_END_IMPL__(x, y) \ - N(); \ - } \ - static_assert( \ - std::is_same::value, \ - "The class name entered into the OP_END_FACTORY_REG needs to be the same as the operator name you define."); \ - } \ - ; \ - static const OperatorCreatorRegister PASTE(g_register, y)(#x, [](const std::string &name) { return x(name); }); \ +#define __OP_END_IMPL__(x, y) \ + N(); \ + } \ + static_assert( \ + std::is_same::value, \ + "The class name entered into the OP_END_FACTORY_REG needs to be the same as the operator name you define."); \ + } \ + ; \ + static const OperatorCreatorRegister PASTE(g_register, y)(#x, [](const std::string &name) { return x(name); }); \ } #define OP_END_FACTORY_REG(x) __OP_END_IMPL__(x, __COUNTER__) @@ -286,7 +286,7 @@ class OpReg { // Common shape inferencer #define ELMTWISE_INFER_SHAPEANDTYPE(in_name, out_name) \ - [](Operator op)->graphStatus { \ + [](Operator op) -> graphStatus { \ auto x_shape = op.GetInputDesc(in_name).GetShape().GetDims(); \ auto x_type = op.GetInputDesc(in_name).GetDataType(); \ TensorDesc op_output_desc = op.GetOutputDesc(out_name); \ @@ -300,7 +300,7 @@ graphStatus BroadCastInfer(const function()> &get_in1_shape, const function &y_shape)> &set_out_shape); #define BROADCAST_INFER(in1_name, in2_name, out_name) \ - [](Operator op)->graphStatus { \ + [](Operator op) -> graphStatus { \ return BroadCastInfer([&]() { return op.GetInputDesc(in1_name).GetShape().GetDims(); }, \ [&]() { return op.GetInputDesc(in2_name).GetShape().GetDims(); }, \ [&](const vector &y_shape) { \ diff --git a/inc/external/graph/tensor.h b/inc/external/graph/tensor.h index 5224c35c..f60d245b 100644 --- a/inc/external/graph/tensor.h +++ b/inc/external/graph/tensor.h @@ -22,8 +22,8 @@ #include #include -#include "external/graph/ge_error_codes.h" -#include "external/graph/types.h" +#include "./ge_error_codes.h" +#include "./types.h" namespace ge { class ShapeImpl; diff --git a/inc/external/graph/types.h b/inc/external/graph/types.h index 605b6c95..c76c5556 100644 --- a/inc/external/graph/types.h +++ b/inc/external/graph/types.h @@ -133,11 +133,13 @@ enum Format { FORMAT_FRACTAL_ZZ, FORMAT_FRACTAL_NZ, FORMAT_NCDHW, - FORMAT_DHWCK, // 3D filter input tensor format + FORMAT_DHWCN, // 3D filter input tensor format FORMAT_NDC1HWC0, FORMAT_FRACTAL_Z_3D, FORMAT_CN, FORMAT_NC, + FORMAT_DHWNC, + 
FORMAT_FRACTAL_Z_3D_TRANSPOSE, // 3D filter(transpose) input tensor format FORMAT_RESERVED, FORMAT_ALL }; diff --git a/inc/external/register/register.h b/inc/external/register/register.h index 87082bee..045a1570 100644 --- a/inc/external/register/register.h +++ b/inc/external/register/register.h @@ -47,6 +47,12 @@ class Tensor; class TBEPluginManager; } // namespace ge +namespace google { +namespace protobuf { +class Message; +} +} // namespace google + namespace domi { Status AutoMappingFn(const google::protobuf::Message *op_src, ge::Operator &op); Status AutoMappingFnDynamic(const google::protobuf::Message *op_src, ge::Operator &op, @@ -56,6 +62,8 @@ using google::protobuf::Message; class OpRegistrationDataImpl; using ParseParamFunc = std::function; +using FusionParseParamFunc = + std::function, ge::Operator &)>; class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistrationData { public: @@ -71,15 +79,20 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistrationData { OpRegistrationData &ParseParamsFn(const ParseParamFunc &parseParamFn); + OpRegistrationData &FusionParseParamsFn(const FusionParseParamFunc &fusionParseParamFn); + OpRegistrationData &ImplyType(const domi::ImplyType &imply_type); OpRegistrationData &DelInputWithCond(int inputIdx, const std::string &attrName, bool attrValue); + OpRegistrationData &DelInputWithOriginalType(int input_idx, const std::string &ori_type); + domi::ImplyType GetImplyType() const; std::string GetOmOptype() const; std::set GetOriginOpTypeSet() const; domi::FrameworkType GetFrameworkType() const; ParseParamFunc GetParseParamFn() const; + FusionParseParamFunc GetFusionParseParamFn() const; private: std::shared_ptr impl_; @@ -103,5 +116,27 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpReceiver { namespace ge { using OpRegistrationData = domi::OpRegistrationData; using OpReceiver = domi::OpReceiver; + +class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOp { + public: + HostCpuOp() = default; + virtual ~HostCpuOp() = default; + + virtual graphStatus Compute(Operator &op, const std::map &inputs, + std::map &outputs) = 0; +}; + +class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOpRegistrar { + public: + HostCpuOpRegistrar(const char *op_type, HostCpuOp *(*create_fn)()); +}; + +#define REGISTER_HOST_CPU_OP_BUILDER(name, op) REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(__COUNTER__, name, op) + +#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(ctr, name, op) REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op) + +#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op) \ + static ::ge::HostCpuOpRegistrar register_host_cpu_op##ctr __attribute__((unused)) = \ + ::ge::HostCpuOpRegistrar(name, []() -> ::ge::HostCpuOp * { return new (std::nothrow) op(); }) } // namespace ge #endif // INC_EXTERNAL_REGISTER_REGISTER_H_ diff --git a/inc/external/register/register_error_codes.h b/inc/external/register/register_error_codes.h index 5bfee8a2..5e0ed79f 100644 --- a/inc/external/register/register_error_codes.h +++ b/inc/external/register/register_error_codes.h @@ -22,7 +22,7 @@ #define DECLARE_ERRORNO(sysid, modid, name, value) \ const domi::Status name = \ - ((0xFF & ((uint8_t)sysid)) << 24) | ((0xFF & ((uint8_t)modid)) << 16) | (0xFFFF & ((uint16_t)value)); + ((0xFF & ((uint8_t)sysid)) << 24) | ((0xFF & ((uint8_t)modid)) << 16) | (0xFFFF & ((uint16_t)value)); #define DECLARE_ERRORNO_COMMON(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_COMMON, name, value) @@ -33,6 +33,7 @@ using Status = uint32_t; DECLARE_ERRORNO(0, 0, 
SUCCESS, 0); DECLARE_ERRORNO(0xFF, 0xFF, FAILED, 0xFFFFFFFF); DECLARE_ERRORNO_COMMON(PARAM_INVALID, 1); // 50331649 +DECLARE_ERRORNO(SYSID_FWK, 1, SCOPE_NOT_CHANGED, 201); } // namespace domi #endif // INC_EXTERNAL_REGISTER_REGISTER_ERROR_CODES_H_ diff --git a/inc/external/register/register_types.h b/inc/external/register/register_types.h index be090281..08d72713 100644 --- a/inc/external/register/register_types.h +++ b/inc/external/register/register_types.h @@ -48,6 +48,10 @@ typedef enum tagDomiTensorFormat { DOMI_TENSOR_BN_WEIGHT, DOMI_TENSOR_CHWN, // Android NN Depth CONV DOMI_TENSOR_FILTER_HWCK, // filter input tensor format + DOMI_TENSOR_NDHWC, + DOMI_TENSOR_NCDHW, + DOMI_TENSOR_DHWCN, // 3D filter input tensor format + DOMI_TENSOR_DHWNC, DOMI_TENSOR_RESERVED } domiTensorFormat_t; } // namespace domi diff --git a/inc/framework/common/debug/ge_log.h b/inc/framework/common/debug/ge_log.h index 1556fd07..e2023cb8 100644 --- a/inc/framework/common/debug/ge_log.h +++ b/inc/framework/common/debug/ge_log.h @@ -18,11 +18,13 @@ #define INC_FRAMEWORK_COMMON_DEBUG_GE_LOG_H_ #include +#include +#include #include "framework/common/ge_inner_error_codes.h" #include "toolchain/slog.h" -#define GE_MODULE_NAME GE +#define GE_MODULE_NAME static_cast(GE) // trace status of log enum TraceStatus { TRACE_INIT = 0, TRACE_RUNNING, TRACE_WAITING, TRACE_STOP }; @@ -35,15 +37,20 @@ enum TraceStatus { TRACE_INIT = 0, TRACE_RUNNING, TRACE_WAITING, TRACE_STOP }; #define GELOGO(...) GE_LOG_OPLOG(GE_MODULE_NAME, __VA_ARGS__) #define GELOGT(VALUE, ...) GE_LOG_TRACE(GE_MODULE_NAME, VALUE, __VA_ARGS__) -inline bool IsLogEnable(int module_name, int log_level) noexcept { - int32_t enable_event = 0; - int32_t dlog_level = dlog_getlevel(module_name, &enable_event); - if (dlog_level <= log_level) { +inline bool IsLogEnable(int module_name, int log_level) { + int32_t enable = CheckLogLevel(module_name, log_level); + // 1:enable, 0:disable + if (enable == 1) { return true; } return false; } +inline pid_t GetTid() { + thread_local static pid_t tid = syscall(__NR_gettid); + return tid; +} + #define GE_TIMESTAMP_START(stage) uint64_t startUsec_##stage = ge::GetCurrentTimestap() #define GE_TIMESTAMP_END(stage, stage_name) \ @@ -68,29 +75,35 @@ inline bool IsLogEnable(int module_name, int log_level) noexcept { GEEVENT("[GEPERFTRACE] The time cost of %s is [%lu] micro second, call num is %lu", (stage_name), time_of##stage, \ call_num_of##stage) -#define GE_LOG_ERROR(MOD_NAME, ERROR_CODE, fmt, ...) \ - dlog_error(static_cast(MOD_NAME), "%s: ErrorNo: %d(%s) " fmt, __FUNCTION__, ERROR_CODE, \ +#define GE_LOG_ERROR(MOD_NAME, ERROR_CODE, fmt, ...) \ + dlog_error(MOD_NAME, "%lu %s: ErrorNo: %d(%s) " fmt, GetTid(), __FUNCTION__, ERROR_CODE, \ ((GE_GET_ERRORNO_STR(ERROR_CODE)).c_str()), ##__VA_ARGS__) -#define GE_LOG_WARN(MOD_NAME, fmt, ...) \ - if (IsLogEnable(static_cast(MOD_NAME), DLOG_WARN)) \ - dlog_warn(static_cast(MOD_NAME), "%s:" fmt, __FUNCTION__, ##__VA_ARGS__) -#define GE_LOG_INFO(MOD_NAME, fmt, ...) \ - if (IsLogEnable(static_cast(MOD_NAME), DLOG_INFO)) \ - dlog_info(static_cast(MOD_NAME), "%s:" fmt, __FUNCTION__, ##__VA_ARGS__) -#define GE_LOG_DEBUG(MOD_NAME, fmt, ...) \ - if (IsLogEnable(static_cast(MOD_NAME), DLOG_DEBUG)) \ - dlog_debug(static_cast(MOD_NAME), "%s:" fmt, __FUNCTION__, ##__VA_ARGS__) -#define GE_LOG_EVENT(MOD_NAME, fmt, ...) dlog_event(static_cast(MOD_NAME), "%s:" fmt, __FUNCTION__, ##__VA_ARGS__) +#define GE_LOG_WARN(MOD_NAME, fmt, ...) 
\ + if (IsLogEnable(MOD_NAME, DLOG_WARN)) dlog_warn(MOD_NAME, "%lu %s:" fmt, GetTid(), __FUNCTION__, ##__VA_ARGS__) +#define GE_LOG_INFO(MOD_NAME, fmt, ...) \ + if (IsLogEnable(MOD_NAME, DLOG_INFO)) dlog_info(MOD_NAME, "%lu %s:" fmt, GetTid(), __FUNCTION__, ##__VA_ARGS__) +#define GE_LOG_DEBUG(MOD_NAME, fmt, ...) \ + if (IsLogEnable(MOD_NAME, DLOG_DEBUG)) dlog_debug(MOD_NAME, "%lu %s:" fmt, GetTid(), __FUNCTION__, ##__VA_ARGS__) +#define GE_LOG_EVENT(MOD_NAME, fmt, ...) dlog_event(MOD_NAME, "%lu %s:" fmt, GetTid(), __FUNCTION__, ##__VA_ARGS__) #define GE_LOG_OPLOG(MOD_NAME, fmt, ...) \ - Dlog(static_cast(MOD_NAME), DLOG_OPLOG, "%s:" fmt, __FUNCTION__, ##__VA_ARGS__) -#define GE_LOG_TRACE(MOD_NAME, value, fmt, ...) \ - do { \ - TraceStatus stat = value; \ - const char *const TraceStatStr[] = {"INIT", "RUNNING", "WAITING", "STOP"}; \ - int idx = static_cast(stat); \ - char *k = const_cast("status"); \ - char *v = const_cast(TraceStatStr[idx]); \ - KeyValue kv = {k, v}; \ - DlogWithKV(static_cast(MOD_NAME), DLOG_TRACE, &kv, 1, "%s:" fmt, __FUNCTION__, ##__VA_ARGS__); \ + Dlog(MOD_NAME, DLOG_OPLOG, "%lu %s:" fmt, GetTid(), __FUNCTION__, ##__VA_ARGS__) + +#define GE_LOG_TRACE(MOD_NAME, value, fmt, ...) \ + do { \ + TraceStatus stat = value; \ + const char *const TraceStatStr[] = {"INIT", "RUNNING", "WAITING", "STOP"}; \ + int idx = static_cast(stat); \ + char *k = const_cast("status"); \ + char *v = const_cast(TraceStatStr[idx]); \ + KeyValue kv = {k, v}; \ + DlogWithKV(static_cast(MOD_NAME), DLOG_TRACE, &kv, 1, "%lu %s:" fmt, GetTid(), __FUNCTION__, ##__VA_ARGS__); \ } while (0) + +// print memory when it is greater than 1KB. +#define GE_PRINT_DYNAMIC_MEMORY(FUNC, PURPOSE, SIZE) \ + do { \ + if ((SIZE) > 1024) { \ + GELOGI("MallocMemory, func=%s, size=%zu, purpose=%s", (#FUNC), static_cast(SIZE), (PURPOSE)); \ + } \ + } while (0); #endif // INC_FRAMEWORK_COMMON_DEBUG_GE_LOG_H_ diff --git a/inc/framework/common/debug/log.h b/inc/framework/common/debug/log.h index fb5418b0..b16aa3fa 100644 --- a/inc/framework/common/debug/log.h +++ b/inc/framework/common/debug/log.h @@ -29,7 +29,18 @@ using cce::CC_STATUS_SUCCESS; using cce::ccStatus_t; -#define GE_LOGE(...) DAV_LOGE("GE", __VA_ARGS__) +#if !defined(__ANDROID__) && !defined(ANDROID) +#define DOMI_LOGE(...) DAV_LOGE("DOMI", __VA_ARGS__) +#else +#include +#if defined(BUILD_VERSION_PERF) +#define DOMI_LOGE(fmt, ...) +#else +// The Android system has strict log control. Do not modify the log. +#define DOMI_LOGE(fmt, ...) \ + __android_log_print(ANDROID_LOG_ERROR, "NPU_FMK", "%s %s(%d)::" #fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#endif +#endif // ge marco #define GE_LOGI_IF(condition, ...) \ @@ -44,7 +55,7 @@ using cce::ccStatus_t; #define GE_LOGE_IF(condition, ...) \ if ((condition)) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ } // If expr is not SUCCESS, print the log and return the same value @@ -52,7 +63,7 @@ using cce::ccStatus_t; do { \ const ge::Status _status = (expr); \ if (_status != ge::SUCCESS) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ return _status; \ } \ } while (0); @@ -62,7 +73,7 @@ using cce::ccStatus_t; do { \ const ge::Status _status = (expr); \ if (_status != ge::SUCCESS) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ } \ } while (0); @@ -75,6 +86,15 @@ using cce::ccStatus_t; } \ } while (0); +// If expr is not GRAPH_SUCCESS, print the log and return FAILED +#define GE_CHK_GRAPH_STATUS_RET(expr, ...) 
\ + do { \ + if ((expr) != ge::GRAPH_SUCCESS) { \ + DOMI_LOGE(__VA_ARGS__); \ + return FAILED; \ + } \ + } while (0); + // If expr is not SUCCESS, print the log and execute a custom statement #define GE_CHK_STATUS_EXEC(expr, exec_expr, ...) \ do { \ @@ -91,25 +111,11 @@ using cce::ccStatus_t; (void)msg.append(ge::StringUtils::FormatString(__VA_ARGS__)); \ (void)msg.append( \ ge::StringUtils::FormatString(" Error Code:0x%X(%s)", _status, GET_ERRORNO_STR(_status).c_str())); \ - GE_LOGE("%s", msg.c_str()); \ + DOMI_LOGE("%s", msg.c_str()); \ return _status; \ } \ } while (0); -// If expr is not true, print the Info log and return the specified status -#define GE_CHK_BOOL_RET_STATUS_LOGI(expr, _status, ...) \ - do { \ - bool b = (expr); \ - if (!b) { \ - std::string msg; \ - (void)msg.append(StringUtils::FormatString(__VA_ARGS__)); \ - (void)msg.append( \ - StringUtils::FormatString(" Check result false, status: 0x%X %s", _status, GET_ERRORNO_STR(_status).c_str())); \ - GELOGI("%s", msg.c_str()); \ - return _status; \ - } \ - } while (0); - // If expr is not true, print the log and return the specified status #define GE_CHK_BOOL_RET_STATUS_NOLOG(expr, _status, ...) \ do { \ @@ -124,7 +130,7 @@ using cce::ccStatus_t; { \ bool b = (expr); \ if (!b) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ exec_expr; \ } \ }; @@ -163,7 +169,7 @@ using cce::ccStatus_t; { \ bool b = (expr); \ if (b) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ exec_expr; \ } \ }; @@ -182,7 +188,7 @@ using cce::ccStatus_t; { \ bool b = (expr); \ if (b) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ exec_expr; \ return; \ } \ @@ -193,7 +199,7 @@ using cce::ccStatus_t; { \ bool b = (expr); \ if (b) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ exec_expr; \ return _status; \ } \ @@ -210,62 +216,42 @@ using cce::ccStatus_t; // -----------------runtime related macro definitions------------------------------- // If expr is not RT_ERROR_NONE, print the log -#define GE_CHK_RT(expr) \ - do { \ - rtError_t _rt_ret = (expr); \ - if (_rt_ret != RT_ERROR_NONE) { \ - GE_LOGE("Call rt api failed, ret: 0x%X", _rt_ret); \ - } \ +#define GE_CHK_RT(expr) \ + do { \ + rtError_t _rt_ret = (expr); \ + if (_rt_ret != RT_ERROR_NONE) { \ + DOMI_LOGE("Call rt api failed, ret: 0x%X", _rt_ret); \ + } \ } while (0); // If expr is not RT_ERROR_NONE, print the log and execute the exec_expr expression -#define GE_CHK_RT_EXEC(expr, exec_expr) \ - { \ - rtError_t _rt_ret = (expr); \ - if (_rt_ret != RT_ERROR_NONE) { \ - GE_LOGE("Call rt api failed, ret: 0x%X", _rt_ret); \ - exec_expr; \ - } \ +#define GE_CHK_RT_EXEC(expr, exec_expr) \ + { \ + rtError_t _rt_ret = (expr); \ + if (_rt_ret != RT_ERROR_NONE) { \ + DOMI_LOGE("Call rt api failed, ret: 0x%X", _rt_ret); \ + exec_expr; \ + } \ } // If expr is not RT_ERROR_NONE, print the log and return -#define GE_CHK_RT_RET(expr) \ - do { \ - rtError_t _rt_ret = (expr); \ - if (_rt_ret != RT_ERROR_NONE) { \ - GE_LOGE("Call rt api failed, ret: 0x%X", _rt_ret); \ - return ge::RT_FAILED; \ - } \ +#define GE_CHK_RT_RET(expr) \ + do { \ + rtError_t _rt_ret = (expr); \ + if (_rt_ret != RT_ERROR_NONE) { \ + DOMI_LOGE("Call rt api failed, ret: 0x%X", _rt_ret); \ + return ge::RT_FAILED; \ + } \ } while (0); // ------------------------cce related macro definitions---------------------------- // If expr is not CC_STATUS_SUCCESS, print the log -#define GE_CHK_CCE(expr) \ - do { \ - ccStatus_t _cc_ret = (expr); \ - if (_cc_ret != CC_STATUS_SUCCESS) { \ - GE_LOGE("Call cce api failed, 
ret: 0x%X", _cc_ret); \ - } \ - } while (0); - -// If expr is not CC_STATUS_SUCCESS, print the log and execute the exec_expr expression -#define GE_CHK_CCE_EXEC(expr, exec_expr) \ - do { \ - ccStatus_t _cc_ret = (expr); \ - if (_cc_ret != CC_STATUS_SUCCESS) { \ - GE_LOGE("Call cce api failed, ret: 0x%X", _cc_ret); \ - exec_expr; \ - } \ - } while (0); - -// If expr is not CC_STATUS_SUCCESS, print the log and return -#define GE_CHK_CCE_RET(expr) \ - do { \ - ccStatus_t _cc_ret = (expr); \ - if (_cc_ret != CC_STATUS_SUCCESS) { \ - GE_LOGE("Call cce api failed, ret: 0x%X", _cc_ret); \ - return ge::CCE_FAILED; \ - } \ +#define GE_CHK_CCE(expr) \ + do { \ + ccStatus_t _cc_ret = (expr); \ + if (_cc_ret != CC_STATUS_SUCCESS) { \ + DOMI_LOGE("Call cce api failed, ret: 0x%X", _cc_ret); \ + } \ } while (0); // If expr is true, execute exec_expr without printing logs @@ -281,37 +267,8 @@ using cce::ccStatus_t; try { \ exec_expr0; \ } catch (const std::bad_alloc &) { \ - GE_LOGE("Make shared failed"); \ + DOMI_LOGE("Make shared failed"); \ exec_expr1; \ } -#define GE_CHECK_INT32_MUL_OVERFLOW(a, b, ...) \ - do { \ - if ((a) > 0) { \ - if ((b) > 0) { \ - if ((a) > (INT32_MAX / (b))) { \ - GE_LOGE(__VA_ARGS__); \ - return ge::FAILED; \ - } \ - } else { \ - if ((b) < (INT32_MIN / (a))) { \ - GE_LOGE(__VA_ARGS__); \ - return ge::FAILED; \ - } \ - } \ - } else { \ - if ((b) > 0) { \ - if ((a) < (INT32_MAX / (b))) { \ - GE_LOGE(__VA_ARGS__); \ - return ge::FAILED; \ - } \ - } else { \ - if (((a) != 0) && ((b) < (INT32_MAX / (a)))) { \ - GE_LOGE(__VA_ARGS__); \ - return ge::FAILED; \ - } \ - } \ - } \ - } while (0); - #endif // INC_FRAMEWORK_COMMON_DEBUG_LOG_H_ diff --git a/inc/framework/common/ge_inner_error_codes.h b/inc/framework/common/ge_inner_error_codes.h index f01ede03..4b5538d3 100644 --- a/inc/framework/common/ge_inner_error_codes.h +++ b/inc/framework/common/ge_inner_error_codes.h @@ -152,7 +152,6 @@ GE_ERRORNO_GRAPH(GE_GRAPH_OPTIMIZE_RUN_GRAPH_INVALID, 11, "Get computeGraph by g GE_ERRORNO_GRAPH(GE_GRAPH_OPTIMIZE_INSERT_DYN_OP_FAILED, 12, "Graph which insert dynamic op failed."); // 1343242252 GE_ERRORNO_GRAPH(GE_GRAPH_OPTIMIZE_PREPROCESS_FAILED, 13, "Graph preprocess failed."); // 1343242253 GE_ERRORNO_GRAPH(GE_GRAPH_OPTIMIZE_GRAPH_FUSION_FAILED, 14, "Graph fusion failed."); // 1343242254 -GE_ERRORNO_GRAPH(GE_GRAPH_OPTIMIZE_TINY_CAL_CHECK_FAILED, 15, "Check tiny calibration failed."); // 1343242255 GE_ERRORNO_GRAPH(GE_GRAPH_OPTIMIZE_CALIBRATION_FAILED, 16, "Calibration failed."); // 1343242256 GE_ERRORNO_GRAPH(GE_GRAPH_SUBGRAPH_NUM_ZERO, 17, "Graph partition success, but subGraph num is 0."); // 1343242257 GE_ERRORNO_GRAPH(GE_GRAPH_SUBGRAPH_ENGINENAME_REPEATED, 18, "Graph subGraph engine name is repeated."); // 1343242258 @@ -204,15 +203,16 @@ GE_ERRORNO_GRAPH(GE_GRAPH_NODE_SEARCHER_GET_GRAPH_REBUILD_FAILED, 60, GE_ERRORNO_GRAPH(GE_GRAPH_NODE_SEARCHER_SET_GRAPH_FINISH_REBUILD_GRAPH_FAILED, 61, "Failed set graph finish rebuild in node searcher."); // 1343242301 GE_ERRORNO_GRAPH(GE_GRAPH_VARIABLE_OP_PASS_FAILED, 62, "Failed to run variable pass."); // 1343242302 -// Optimize errocode -GE_ERRORNO_GRAPH(TO_BE_DELETED, 200, "The node of the graph to be deleted."); -GE_ERRORNO_GRAPH(NOT_CHANGED, 201, "NThe node of the graph not changed."); // Engine_manager module error code definition GE_ERRORNO_ENGINE(GE_ENG_INIT_FAILED, 0, "Failed to initialize engine."); // 1343246336 GE_ERRORNO_ENGINE(GE_ENG_FINALIZE_FAILED, 1, "Engine finalize failed."); // 1343246337 GE_ERRORNO_ENGINE(GE_ENG_MEMTYPE_ERROR, 2, 
"Memory type HBM is necessary when engine is in device"); // 1343246338 +// Optimize errocode +GE_ERRORNO_GRAPH(TO_BE_DELETED, 63, "The node of the graph to be deleted."); // 1343242303 +GE_ERRORNO_GRAPH(NOT_CHANGED, 64, "The node of the graph no changed."); // 1343242304 + // Ops module error code definition GE_ERRORNO_OPS(GE_OPS_KERNEL_STORE_INIT_FAILED, 0, "Failed to initialize OpsKernelInfoStore."); // 1343250432 GE_ERRORNO_OPS(GE_OPS_GRAPH_OPTIMIZER_INIT_FAILED, 1, "Failed to initialize GraphOptimizer."); // 1343250433 diff --git a/inc/framework/common/ge_types.h b/inc/framework/common/ge_types.h index c3a26c49..6ff3404e 100644 --- a/inc/framework/common/ge_types.h +++ b/inc/framework/common/ge_types.h @@ -24,8 +24,7 @@ #include "common/fmk_error_codes.h" #include "ge/ge_api_error_codes.h" - -using std::string; +#include "external/graph/types.h" namespace ge { enum RuntimeType { HOST = 0, DEVICE = 1 }; @@ -56,7 +55,7 @@ struct DataBuffer { /// /// @ingroup domi_ome -/// @brief External inputdata +/// @brief External input data /// struct InputData { uint32_t index; // Index of input data @@ -65,13 +64,14 @@ struct InputData { uint32_t model_id; // Model ID required for data processing uint64_t request_id = 0; // Request ID std::vector blobs; // Actual input data, currently only supports one input + bool is_dynamic_batch = false; // Whether is dynamic batch size scene, default:false + std::string batch_label; // Gear used for current inference in dynamic batch scene }; -// The definition of output result structure +/// Output result structure definition struct OutputData { uint32_t index; // Index of input data uint32_t model_id; // The model ID corresponding to the processing result - /// Output data cache, arranged in sequence of output operators. 
/// If the operator has multiple outputs, /// the data buffer order of the operator is the same as that defined in the @@ -142,11 +142,31 @@ struct Options { bool deployMode; bool isAICPUMode; bool enable_atomic; - string podName; + std::string podName; int64_t rankId; - string rankTableFile; + std::string rankTableFile; int32_t ge_hccl_flag = 0; int32_t physical_device_id; }; + +// Profiling info of task +struct TaskDescInfo { + std::string op_name; + uint32_t block_dim; + uint32_t task_id; + uint32_t stream_id; +}; + +// Profiling info of graph +struct ComputeGraphDescInfo { + std::string op_name; + std::string op_type; + std::vector input_format; + std::vector> input_shape; + std::vector input_data_type; + std::vector output_format; + std::vector> output_shape; + std::vector output_data_type; +}; } // namespace ge #endif // INC_FRAMEWORK_COMMON_GE_TYPES_H_ diff --git a/inc/framework/common/helper/model_helper.h b/inc/framework/common/helper/model_helper.h index 6513265f..c918c039 100644 --- a/inc/framework/common/helper/model_helper.h +++ b/inc/framework/common/helper/model_helper.h @@ -19,7 +19,6 @@ #include #include -#include #include "common/fmk_types.h" #include "common/helper/om_file_helper.h" @@ -33,36 +32,41 @@ class ModelHelper { ModelHelper() = default; ~ModelHelper(); - Status SaveToOmModel(const GeModelPtr &ge_model, const SaveParam &save_param, const std::string &output_file); - Status SaveOriginalGraphToOmModel(const ge::Graph &graph, const std::string &output_file); - Status LoadModel(const ge::ModelData &model_data); + Status SaveToOmModel(const GeModelPtr& ge_model, const SaveParam& save_param, const std::string& output_file, + ge::ModelBufferData& model); + Status SaveOriginalGraphToOmModel(const ge::Graph& graph, const std::string& output_file); + Status LoadModel(const ge::ModelData& model_data); + Status GetModelBufferData(ge::ModelBufferData& model); - ModelFileHeader *GetFileHeader() { return file_header_; } + ModelFileHeader* GetFileHeader() { return file_header_; } GeModelPtr GetGeModel(); + void SetSaveMode(bool val) { is_offline_ = val; } + bool GetSaveMode(void) const { return is_offline_; } - static Status TransModelToGeModel(const ModelPtr &model, GeModelPtr &ge_model); - static Status TransGeModelToModel(const GeModelPtr &geModelPtr, ModelPtr &modelPtr); + static Status TransModelToGeModel(const ModelPtr& model, GeModelPtr& ge_model); + static Status TransGeModelToModel(const GeModelPtr& geModelPtr, ModelPtr& modelPtr); private: bool is_assign_model_ = false; - ModelFileHeader *file_header_ = nullptr; + bool is_offline_ = true; + ModelFileHeader* file_header_ = nullptr; // Encrypted model need delete temp model and unencrypted model need not delete model - uint8_t *model_addr_tmp_ = nullptr; + uint8_t* model_addr_tmp_ = nullptr; uint32_t model_len_tmp_ = 0; GeModelPtr model_; - ModelHelper(const ModelHelper &); - ModelHelper &operator=(const ModelHelper &); - Status GenerateGeModel(OmFileLoadHelper &om_load_helper); - Status LoadModelData(OmFileLoadHelper &om_load_helper); - void SetModelToGeModel(ge::Model &model); - Status LoadWeights(OmFileLoadHelper &om_load_helper); - Status LoadTask(OmFileLoadHelper &om_load_helper); - Status LoadTBEKernelStore(OmFileLoadHelper &om_load_helper); + ModelHelper(const ModelHelper&); + ModelHelper& operator=(const ModelHelper&); + Status GenerateGeModel(OmFileLoadHelper& om_load_helper); + Status LoadModelData(OmFileLoadHelper& om_load_helper); + void SetModelToGeModel(ge::Model& model); + Status 
LoadWeights(OmFileLoadHelper& om_load_helper); + Status LoadTask(OmFileLoadHelper& om_load_helper); + Status LoadTBEKernelStore(OmFileLoadHelper& om_load_helper); Status ReleaseLocalModelData() noexcept; - Status SaveModelPartition(std::shared_ptr &om_file_save_helper, ModelPartitionType type, - const uint8_t *data, size_t size); + Status SaveModelPartition(std::shared_ptr& om_file_save_helper, ModelPartitionType type, + const uint8_t* data, size_t size); }; } // namespace ge #endif // INC_FRAMEWORK_COMMON_HELPER_MODEL_HELPER_H_ diff --git a/inc/framework/common/helper/om_file_helper.h b/inc/framework/common/helper/om_file_helper.h index daabd118..1e4cee9b 100644 --- a/inc/framework/common/helper/om_file_helper.h +++ b/inc/framework/common/helper/om_file_helper.h @@ -20,10 +20,12 @@ #include #include +#include "external/ge/ge_ir_build.h" #include "framework/common/fmk_types.h" -#include "framework/common/ge_types.h" #include "framework/common/types.h" +#include "framework/common/ge_types.h" +using ProcParam = struct PROC_PARAM; using std::string; using std::vector; @@ -80,9 +82,10 @@ class OmFileSaveHelper { const std::vector &GetModelPartitions() const; - Status SaveModel(const SaveParam &save_param, const char *target_file); + Status SaveModel(const SaveParam &save_param, const char *target_file, ge::ModelBufferData &model, + bool is_offline = true); - Status SaveModelToFile(const char *output_file); + Status SaveModelToFile(const char *output_file, ge::ModelBufferData &model, bool is_offline = true); ModelFileHeader model_header_; OmFileContext context_; diff --git a/inc/framework/common/l2_cache_optimize.h b/inc/framework/common/l2_cache_optimize.h index a68ebcf2..c65f67b3 100644 --- a/inc/framework/common/l2_cache_optimize.h +++ b/inc/framework/common/l2_cache_optimize.h @@ -120,4 +120,4 @@ class L2CacheOptimize { }; } // namespace ge -#endif // INC_FRAMEWORK_COMMON_L2_CACHE_OPTIMIZE_H_ +#endif // INC_FRAMEWORK_COMMON_L2_CACHE_OPTIMIZE_H_ \ No newline at end of file diff --git a/inc/framework/common/op/attr_define.h b/inc/framework/common/op/attr_define.h index 1e2c3ab4..536a860e 100644 --- a/inc/framework/common/op/attr_define.h +++ b/inc/framework/common/op/attr_define.h @@ -649,6 +649,8 @@ extern FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string ATTR_M extern FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string ATTR_MODEL_EVENT_NUM; +extern FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string ATTR_MODEL_LABEL_NUM; + extern FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string ATTR_MODEL_MEMORY_SIZE; extern FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string ATTR_MODEL_WEIGHT_SIZE; @@ -801,6 +803,8 @@ extern FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string ATTR_N extern FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string ATTR_NAME_NET_OUTPUT_FORMAT; extern FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string ATTR_NAME_NET_OUTPUT_DATATYPE; +// For constant folding +extern FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::string ATTR_NO_NEED_CONSTANT_FOLDING; } // namespace domi #endif // INC_FRAMEWORK_COMMON_OP_ATTR_DEFINE_H_ diff --git a/inc/framework/common/op/attr_value_util.h b/inc/framework/common/op/attr_value_util.h index 98f27c24..8a90cfa2 100644 --- a/inc/framework/common/op/attr_value_util.h +++ b/inc/framework/common/op/attr_value_util.h @@ -17,11 +17,12 @@ #ifndef INC_FRAMEWORK_COMMON_OP_ATTR_VALUE_UTIL_H_ #define 
INC_FRAMEWORK_COMMON_OP_ATTR_VALUE_UTIL_H_ +#include #include #include -#include -#include "graph/debug/ge_attr_define.h" + #include "common/types.h" +#include "graph/debug/ge_attr_define.h" #include "proto/om.pb.h" using domi::AttrDef; diff --git a/inc/framework/common/op/ge_op_utils.h b/inc/framework/common/op/ge_op_utils.h index dd933de7..87cf54d8 100644 --- a/inc/framework/common/op/ge_op_utils.h +++ b/inc/framework/common/op/ge_op_utils.h @@ -18,7 +18,6 @@ #define INC_FRAMEWORK_COMMON_OP_GE_OP_UTILS_H_ #include - #include #include @@ -56,6 +55,15 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint32_t SWITCH_TR FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint32_t SWITCH_DATA_INPUT; FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint32_t SWITCH_PRED_INPUT; +// FunctionOp +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint32_t IF_COND_INPUT; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint32_t FOR_START_INPUT; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint32_t FOR_LIMIT_INPUT; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint32_t FOR_DELTA_INPUT; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint32_t FOR_DATA_INPUT; + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const int NORMAL_TENSOR_SIZE; + class OpUtils { public: /// @@ -164,15 +172,23 @@ class OpUtils { /// static Status ConvertAippParams(const GeAttrValue::NamedAttrs &aipp_attr, domi::AippOpParams *aipp_params); static Status TransferDim(const std::vector &dim, std::vector &dim_vector); - static void SliceData(std::vector &input, int64_t chunk_size, std::vector &output, int64_t begin, - int64_t out_dim, int64_t stride); + template + static void SliceData(const std::vector &input, int64_t chunk_size, std::vector &output, + int64_t begin, int64_t out_dim, int64_t stride); + template + static Status SetDataByDataType(size_t out_size, const std::vector &chunk_input, + const std::vector &chunk_output, GeTensor *output); + template + static Status SetOutputSliceDataByDataType(void *data, int64_t data_size, const std::vector &input_dims, + const std::vector &begin, const std::vector &output_dims, + ge::GeTensor *output, const std::vector &stride); static Status SetOutputSliceData(void *data, int64_t data_size, int32_t data_type, std::vector &input_dims, std::vector &begin, std::vector &output_dims, ge::GeTensor *output, std::vector &stride); /// /// @ingroup domi_omg - /// @brief Convert the convolution weight data from [h, w, c, k] to [k, c, h, w] + /// @brief Convert the convolutional weight data from [h, w, c, k] to [k, c, h, w] /// @param [in] input Weight data in HWCK format /// @param [in] H value of H dimension /// @param [in] W value of W dimension @@ -183,7 +199,7 @@ class OpUtils { static void TransDataHWCK2KCHW(const void *input, int64_t H, int64_t W, int64_t C, int64_t K, void **output); /// /// @ingroup domi_omg - /// @brief Converts the convolution weight data from [k, c, h, w] to [h, w, c, k]. + /// @brief Converts the convolutional weight data from [k, c, h, w] to [h, w, c, k]. 
/// @param [in] input Weight data in HWCK format /// @param [in] K value of K dimension /// @param [in] C value of C dimension @@ -222,7 +238,6 @@ using CceTensorDescriptorPtr = std::shared_ptr; class CceTensorDescriptor { public: explicit CceTensorDescriptor(ccTensorDescriptor_t cc_tensor); - CceTensorDescriptor(const CceTensorDescriptor &) = delete; CceTensorDescriptor &operator=(const CceTensorDescriptor &) = delete; diff --git a/inc/framework/common/op/op_parser_util.h b/inc/framework/common/op/op_parser_util.h index e64ddc92..49b4350a 100644 --- a/inc/framework/common/op/op_parser_util.h +++ b/inc/framework/common/op/op_parser_util.h @@ -22,7 +22,7 @@ #include #include -namespace domi { +namespace ge { // general const float DEFAULT_ALPHA_VALUE = 1.0; const float DEFAULT_BETA_VALUE = 0.0; @@ -421,5 +421,5 @@ const uint32_t MULTI_SHAPE_INPUT_NUM = 2; // Shufflechannel const uint32_t SHUFFLECHANNEL_DEFAULT_GROUP = 1; -} // namespace domi +} // namespace ge #endif // INC_FRAMEWORK_COMMON_OP_OP_PARSER_UTIL_H_ diff --git a/inc/framework/common/scope_guard.h b/inc/framework/common/scope_guard.h index 79d71311..2154648d 100644 --- a/inc/framework/common/scope_guard.h +++ b/inc/framework/common/scope_guard.h @@ -25,7 +25,7 @@ /// MAKE_GUARD([&] { Release Resource 1 }) /// Acquire Resource 2 // MAKE_GUARD([&] { Release Resource 2 }) -#define GE_MAKE_GUARD(var, callback) ge::ScopeGuard make_guard_##var(callback) +#define GE_MAKE_GUARD(var, callback) ScopeGuard make_guard_##var(callback) #define GE_DISMISS_GUARD(var) make_guard_##var.Dismiss() namespace ge { diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h index 10ddc473..0adc812d 100644 --- a/inc/framework/common/types.h +++ b/inc/framework/common/types.h @@ -156,6 +156,7 @@ REGISTER_OPTYPE_DECLARE(GATHER, "Gather"); REGISTER_OPTYPE_DECLARE(REALDIV, "RealDiv"); REGISTER_OPTYPE_DECLARE(PACK, "Pack"); REGISTER_OPTYPE_DECLARE(SLICE, "Slice"); +REGISTER_OPTYPE_DECLARE(SLICED, "SliceD"); REGISTER_OPTYPE_DECLARE(FLOORDIV, "FloorDiv"); REGISTER_OPTYPE_DECLARE(SQUEEZE, "Squeeze"); REGISTER_OPTYPE_DECLARE(STRIDEDSLICE, "StridedSlice"); @@ -188,6 +189,19 @@ REGISTER_OPTYPE_DECLARE(REFNEXTITERATION, "RefNextIteration"); REGISTER_OPTYPE_DECLARE(EXIT, "Exit"); REGISTER_OPTYPE_DECLARE(REFEXIT, "RefExit"); REGISTER_OPTYPE_DECLARE(CONTROLTRIGGER, "ControlTrigger"); +REGISTER_OPTYPE_DECLARE(SYMBOLICGRADIENT, "SymbolicGradient"); +REGISTER_OPTYPE_DECLARE(REMOTECALL, "RemoteCall"); +REGISTER_OPTYPE_DECLARE(_IF, "_If"); +REGISTER_OPTYPE_DECLARE(STATELESSIF, "StatelessIf"); +REGISTER_OPTYPE_DECLARE(IF, "If"); +REGISTER_OPTYPE_DECLARE(CASE, "Case"); +REGISTER_OPTYPE_DECLARE(_WHILE, "_While"); +REGISTER_OPTYPE_DECLARE(WHILE, "While"); +REGISTER_OPTYPE_DECLARE(STATELESSWHILE, "StatelessWhile"); +REGISTER_OPTYPE_DECLARE(FOR, "For"); +REGISTER_OPTYPE_DECLARE(PARTITIONEDCALL, "PartitionedCall"); +REGISTER_OPTYPE_DECLARE(STATEFULPARTITIONEDCALL, "StatefulPartitionedCall"); +REGISTER_OPTYPE_DECLARE(FAKEPARAM, "FakeParam"); REGISTER_OPTYPE_DECLARE(TRANSPOSE, "Transpose"); REGISTER_OPTYPE_DECLARE(TRANSPOSED, "TransposeD"); REGISTER_OPTYPE_DECLARE(CAST, "Cast"); @@ -424,6 +438,12 @@ REGISTER_OPTYPE_DECLARE(STREAMMERGE, "StreamMerge"); REGISTER_OPTYPE_DECLARE(ENDGRAPH, "EndGraph"); REGISTER_OPTYPE_DECLARE(SEND, "Send"); REGISTER_OPTYPE_DECLARE(RECV, "Recv"); + +REGISTER_OPTYPE_DECLARE(LABELSET, "LabelSet"); +REGISTER_OPTYPE_DECLARE(LABELGOTO, "LabelGoto"); +REGISTER_OPTYPE_DECLARE(LABELSWITCH, "LabelSwitch"); +REGISTER_OPTYPE_DECLARE(LABELSWITCHBYINDEX, 
"LabelSwitchByIndex"); + REGISTER_OPTYPE_DECLARE(ATOMICADDRCLEAN, "AtomicAddrClean"); REGISTER_OPTYPE_DECLARE(ABS_GRAD, "AbsGrad"); @@ -1032,14 +1052,11 @@ struct BasicInfo { uint32_t workspace_size; // workspace uint32_t total_size; // total memory size }; - #pragma pack() // Cancels single-byte alignment } // namespace ge namespace domi { - /// @brief Data structure definition related to task sinking -/// Build model enum BuildMode { GEN_TASK_WITHOUT_L2FUSION = 3, // Carrying task data (L2 convergence function disabled) GEN_TASK_WITHOUT_FUSION = 4, // Carrying task data (all convergence functions disabled) diff --git a/inc/framework/common/util.h b/inc/framework/common/util.h index d7a1822c..4c37c01e 100644 --- a/inc/framework/common/util.h +++ b/inc/framework/common/util.h @@ -30,6 +30,14 @@ #include "framework/common/ge_inner_error_codes.h" #include "mmpa/mmpa_api.h" +#define GE_CHECK_POSITIVE_SIZE_RANGE(size) \ + do { \ + if (size <= 0) { \ + DOMI_LOGE(param[#size] is not a positive number); \ + return PARAM_INVALID; \ + } \ + } while (0) + #define CHECK_FALSE_EXEC(expr, exec_expr, ...) \ { \ bool b = (expr); \ @@ -50,21 +58,6 @@ if (var) GE_CHK_RT(rtStreamDestroy(var)); \ }); -#define GE_MAKE_GUARD_RTEVENT(var) \ - GE_MAKE_GUARD(var, [&] { \ - if (var) GE_CHK_RT(rtEventDestroy(var)); \ - }); - -#define GE_MAKE_GUARD_TENSOR(var) \ - GE_MAKE_GUARD(var, [&] { \ - if (var) GE_CHK_CCE(ccDestroyTensorDescriptor(&var)); \ - }); - -#define GE_MAKE_GUARD_FILTER_DESC(var) \ - GE_MAKE_GUARD(var, [&] { \ - if (var) GE_CHK_CCE(ccDestroyFilterDescriptor(&var)); \ - }); - // For propagating errors when calling a function. #define GE_RETURN_IF_ERROR(expr) \ do { \ @@ -76,7 +69,7 @@ do { \ const ::ge::Status _status = (expr); \ if (_status) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ return _status; \ } \ } while (0) @@ -85,7 +78,7 @@ #define GE_RETURN_WITH_LOG_IF_TRUE(condition, ...) \ do { \ if (condition) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ return ge::FAILED; \ } \ } while (0) @@ -95,7 +88,7 @@ do { \ bool _condition = (condition); \ if (!_condition) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ return ge::FAILED; \ } \ } while (0) @@ -104,7 +97,7 @@ #define GE_RT_PARAM_INVALID_WITH_LOG_IF_TRUE(condition, ...) \ do { \ if (condition) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ return ge::PARAM_INVALID; \ } \ } while (0) @@ -114,111 +107,90 @@ do { \ bool _condition = (condition); \ if (!_condition) { \ - GE_LOGE(__VA_ARGS__); \ + DOMI_LOGE(__VA_ARGS__); \ return ge::PARAM_INVALID; \ } \ } while (0) // Check if the parameter is null. If yes, return PARAM_INVALID and record the error -#define GE_CHECK_NOTNULL(val) \ - do { \ - if (val == nullptr) { \ - GE_LOGE(param[#val] must not be null.); \ - return ge::PARAM_INVALID; \ - } \ +#define GE_CHECK_NOTNULL(val) \ + do { \ + if (val == nullptr) { \ + DOMI_LOGE(param[#val] must not be null.); \ + return ge::PARAM_INVALID; \ + } \ } while (0) -// Check if the parameter is null. If yes, return PARAM_INVALID and record the error -#define GE_CHECK_NOTNULL_JUST_RETURN(val) \ - do { \ - if (val == nullptr) { \ - GE_LOGE(param[#val] must not be null.); \ - return; \ - } \ +// Check if the parameter is null. If yes, just return and record the error +#define GE_CHECK_NOTNULL_JUST_RETURN(val) \ + do { \ + if (val == nullptr) { \ + DOMI_LOGE(param[#val] must not be null.); \ + return; \ + } \ } while (0) // Check whether the parameter is null. 
If so, execute the exec_expr expression and record the error log -#define GE_CHECK_NOTNULL_EXEC(val, exec_expr) \ - do { \ - if (val == nullptr) { \ - GE_LOGE(param[#val] must not be null.); \ - exec_expr; \ - } \ +#define GE_CHECK_NOTNULL_EXEC(val, exec_expr) \ + do { \ + if (val == nullptr) { \ + DOMI_LOGE(param[#val] must not be null.); \ + exec_expr; \ + } \ } while (0) // Check whether the parameter is null. If yes, return directly and record the error log -#define GE_RT_VOID_CHECK_NOTNULL(val) \ - do { \ - if (val == nullptr) { \ - GE_LOGE(param[#val] must not be null.); \ - return; \ - } \ +#define GE_RT_VOID_CHECK_NOTNULL(val) \ + do { \ + if (val == nullptr) { \ + DOMI_LOGE(param[#val] must not be null.); \ + return; \ + } \ } while (0) // Check if the parameter is null. If yes, return false and record the error log -#define GE_RT_FALSE_CHECK_NOTNULL(val) \ - do { \ - if (val == nullptr) { \ - GE_LOGE(param[#val] must not be null.); \ - return false; \ - } \ +#define GE_RT_FALSE_CHECK_NOTNULL(val) \ + do { \ + if (val == nullptr) { \ + DOMI_LOGE(param[#val] must not be null.); \ + return false; \ + } \ } while (0) // Check if the parameter is out of bounds -#define GE_CHECK_SIZE(size) \ - do { \ - if (size == 0) { \ - GE_LOGE(param[#size] is out of range); \ - return ge::PARAM_INVALID; \ - } \ +#define GE_CHECK_SIZE(size) \ + do { \ + if (size == 0) { \ + DOMI_LOGE(param[#size] is out of range); \ + return ge::PARAM_INVALID; \ + } \ } while (0) -// Macros that define the size variable -#define GE_DEFINE_BYTE_SIZE(_var_name, _expr, _sizeof) \ - uint32_t _var_name; \ - do { \ - uint32_t _expr_size = (_expr); \ - uint32_t _sizeof_size = (_sizeof); \ - if (_expr_size > (0xffffffff) / _sizeof_size) { \ - GE_LOGE(byte size : #_var_name is out of range); \ - return ge::PARAM_INVALID; \ - } \ - _var_name = _sizeof_size * _expr_size; \ - } while (0); - // Check if the container is empty -#define GE_CHECK_VECTOR_NOT_EMPTY(vector) \ - do { \ - if (vector.empty()) { \ - GE_LOGE(param[#vector] is empty !); \ - return ge::FAILED; \ - } \ - } while (0) - -#define GE_CHECK_POSITIVE_SIZE_RANGE(size) \ - do { \ - if (size <= 0) { \ - GE_LOGE(param[#size] is not a positive number); \ - return ge::PARAM_INVALID; \ - } \ +#define GE_CHECK_VECTOR_NOT_EMPTY(vector) \ + do { \ + if (vector.empty()) { \ + DOMI_LOGE(param[#vector] is empty !); \ + return ge::FAILED; \ + } \ } while (0) // Check if the value on the left is greater than or equal to the value on the right -#define GE_CHECK_GE(lhs, rhs) \ - do { \ - if (lhs < rhs) { \ - GE_LOGE(param[#lhs] is less than[#rhs]); \ - return ge::PARAM_INVALID; \ - } \ +#define GE_CHECK_GE(lhs, rhs) \ + do { \ + if (lhs < rhs) { \ + DOMI_LOGE(param[#lhs] is less than[#rhs]); \ + return ge::PARAM_INVALID; \ + } \ } while (0) // Check if the value on the left is less than or equal to the value on the right -#define GE_CHECK_LE(lhs, rhs) \ - do { \ - if (lhs > rhs) { \ - GE_LOGE(param[#lhs] is greater than[#rhs]); \ - return ge::PARAM_INVALID; \ - } \ +#define GE_CHECK_LE(lhs, rhs) \ + do { \ + if (lhs > rhs) { \ + DOMI_LOGE(param[#lhs] is greater than[#rhs]); \ + return ge::PARAM_INVALID; \ + } \ } while (0) #define GE_DELETE_NEW_SINGLE(var) \ diff --git a/inc/framework/dlog/log.h b/inc/framework/dlog/log.h index 899a98b1..8126720c 100644 --- a/inc/framework/dlog/log.h +++ b/inc/framework/dlog/log.h @@ -52,10 +52,10 @@ #define DLOG_DECLARE(level) \ void Log_##level(const char *mod_name, const char *func, const char *file, int line, const char *format, ...) 
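To make the token pasting concrete: DLOG_DECLARE(INFO) glues the level into the function name, and the namespace change that follows moves the callers from ge::Log_INFO to domi::Log_INFO. A minimal standalone sketch, with a toy vprintf body standing in for the real slog backend:

#include <cstdarg>
#include <cstdio>

// Same shape as the DLOG_DECLARE macro above: ## pastes the level into the function name.
#define DLOG_DECLARE(level) \
  void Log_##level(const char *mod_name, const char *func, const char *file, int line, const char *format, ...)

namespace domi {
DLOG_DECLARE(INFO);   // expands to: void Log_INFO(const char *, const char *, const char *, int, const char *, ...)

DLOG_DECLARE(INFO) {  // the same macro can introduce the definition
  va_list args;
  va_start(args, format);
  std::printf("[%s] %s (%s:%d) ", mod_name, func, file, line);
  std::vprintf(format, args);
  va_end(args);
}
}  // namespace domi

int main() {
  domi::Log_INFO("GE", __FUNCTION__, __FILE__, __LINE__, "ret=%d\n", 0);
  return 0;
}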
-namespace ge { +namespace domi { DLOG_DECLARE(INFO); DLOG_DECLARE(WARNING); DLOG_DECLARE(ERROR); -} // namespace ge +} // namespace domi #endif // INC_FRAMEWORK_DLOG_LOG_H_ diff --git a/inc/framework/engine/dnnengine.h b/inc/framework/engine/dnnengine.h index 34cb2569..142ac229 100644 --- a/inc/framework/engine/dnnengine.h +++ b/inc/framework/engine/dnnengine.h @@ -38,7 +38,7 @@ struct DNNEngineAttribute { std::vector mem_type; uint32_t compute_cost; enum RuntimeType runtime_type; // HOST, DEVICE - // set this attribute if the inputformat of engine must be specific, otherwise set FORMAT_RESERVED + // If engine input format must be specific, set this attribute, else set FORMAT_RESERVED Format engine_input_format; Format engine_output_format; }; diff --git a/inc/framework/executor/ge_executor.h b/inc/framework/executor/ge_executor.h index 45398397..96f204b2 100644 --- a/inc/framework/executor/ge_executor.h +++ b/inc/framework/executor/ge_executor.h @@ -26,6 +26,7 @@ #include "common/types.h" #include "graph/tensor.h" #include "runtime/base.h" +#include "common/dynamic_aipp.h" namespace ge { class ModelListenerAdapter; @@ -33,12 +34,15 @@ class ModelListenerAdapter; class SingleOp; struct RunModelData { - uint32_t index; // Data index - uint32_t model_id; // Model id - std::vector blobs; // All input/output data buffer - uint32_t timestamp; // Data creation time - uint32_t timeout; // Processing timeout - uint64_t request_id = 0; // Request ID + uint32_t index; // Data index + uint32_t modelId; + std::vector blobs; // All input/output data buffer + uint32_t timestamp; // Data creation time + uint32_t timeout; // Processing timeout + uint64_t request_id = 0; // Request ID + uint64_t dynamic_batch_size = 0; // Dynamic batch size scene, set dynamic size, not supported by default:0 + uint64_t dynamic_image_height = 0; // Dynamic image size scene, set image height, not supported by default:0 + uint64_t dynamic_image_width = 0; // Dynamic image size scene, set image width, not supported by default:0 }; class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor { @@ -46,12 +50,13 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor { GeExecutor(); ~GeExecutor() = default; ge::Status Initialize(); + ge::Status Finalize(); // Load model ge::Status LoadModelOffline(uint32_t &model_id, const std::string &path, const std::string &key, int32_t priority, std::shared_ptr listener); - ge::Status UnloadModel(uint32_t model_id); + ge::Status UnloadModel(uint32_t modelId); ge::Status RunModel(const ge::RunModelData &input_data, ge::RunModelData &output_data); @@ -59,6 +64,52 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor { ge::Status GetModelDescInfo(uint32_t model_id, std::vector &input_desc, std::vector &output_desc); + /// + /// @ingroup ge + /// @brief Set dynamic batch size + /// @param [in] model_id: model id allocate from manager + /// @param [in] dynamic_input_addr: dynamic input addr created by user + /// @param [in] length: length of dynamic input addr + /// @param [in] batch_size: batch size entered by user in dynamic multi-batch scenario + /// @return execute result + /// + ge::Status SetDynamicBatchSize(uint32_t model_id, void *dynamic_input_addr, uint64_t length, uint64_t batch_size); + + /// + /// @ingroup ge + /// @brief Set dynamic image info + /// @param [in] model_id: model id allocate from manager + /// @param [in] dynamic_input_addr: dynamic input addr created by user + /// @param [in] length: length of dynamic input addr + /// @param [in] 
image_height: image height entered by user in dynamic multi-resolution scenario + /// @param [in] image_width: image width entered by user in dynamic multi-resolution scenario + /// @return execute result + /// + ge::Status SetDynamicImageSize(uint32_t model_id, void *dynamic_input_addr, uint64_t length, uint64_t image_height, + uint64_t image_width); + /// + /// @ingroup ge + /// @brief Get dynamic batch_info + /// @param [in] model_id + /// @param [out] batch_info + /// @return execute result + /// + ge::Status GetDynamicBatchInfo(uint32_t model_id, std::vector> &batch_info); + + /// + /// @ingroup ge + /// @brief Set dynamic image info + /// @param [in] model_id: model id allocate from manager + /// @param [in] dynamic_input_addr: dynamic input addr created by user + /// @param [in] length: length of dynamic input addr + /// @param [in] aippBatchPara: kAippDynamicBatchPara vector by user in dynamic aipp + /// @param [in] aippParms: kAippDynamicPara by user in dynamic aipp + /// @return execute result + /// + ge::Status SetDynamicAippData(uint32_t model_id, void *dynamic_input_addr, uint64_t length, + const std::vector &aippBatchPara, + const kAippDynamicPara &aippParms); + ge::Status GetModelDescInfoForZeroCopy(uint32_t model_id, std::vector &input_desc, std::vector &output_desc); @@ -147,7 +198,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor { /// ge::Status GetMemAndWeightSize(const void *model_data, size_t model_size, size_t &mem_size, size_t &weight_size); - static ge::Status LoadSingleOp(const std::string &model_name, const ge::ModelData &model_data, void *stream, + static ge::Status LoadSingleOp(const std::string &modelName, const ge::ModelData &modelData, void *stream, SingleOp **single_op); static ge::Status ExecuteAsync(SingleOp *executor, const std::vector &inputs, @@ -156,8 +207,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor { static ge::Status ReleaseSingleOpResource(void *stream); private: - static bool is_init_; - std::vector> listener_adapters_; + static bool isInit_; }; ge::Status ModelInfoParser(const ge::ModelData &model, ge::ModelInfo &model_info); diff --git a/inc/framework/generator/ge_generator.h b/inc/framework/generator/ge_generator.h index 1013a581..9bc13f24 100644 --- a/inc/framework/generator/ge_generator.h +++ b/inc/framework/generator/ge_generator.h @@ -21,7 +21,7 @@ #include #include #include - +#include "ge/ge_ir_build.h" #include "common/ge_inner_error_codes.h" #include "graph/ge_tensor.h" #include "graph/graph.h" @@ -45,6 +45,8 @@ class GeGenerator { Status GenerateOfflineModel(const Graph &graph, const std::string &file_name_prefix, const std::vector &inputs = std::vector()); + Status GenerateOnlineModel(const Graph &graph, const vector &inputs, ge::ModelBufferData &model); + /// /// @ingroup ge /// @brief: Build single OP in Model. 
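Taken together, the new GeExecutor entry points form a fixed call sequence for the dynamic-batch path. A hedged usage sketch, not part of the patch: the model path, buffer, and sizes are placeholders, the listener template parameter is elided in the header above so nullptr stands in for a real listener, and dynamic_input_addr is assumed to be device memory reserved for the gear value.

#include <cstdint>
#include <vector>
#include "framework/executor/ge_executor.h"

ge::Status RunWithDynamicBatch(void *dynamic_input_addr, uint64_t addr_len) {
  ge::GeExecutor executor;
  if (executor.Initialize() != ge::SUCCESS) return ge::FAILED;

  uint32_t model_id = 0;
  if (executor.LoadModelOffline(model_id, "/path/to/model.om", "", 0, nullptr) != ge::SUCCESS) {
    return ge::FAILED;
  }

  // Ask which batch gears the model was built with, then select one of them.
  std::vector<std::vector<int64_t>> batch_info;
  (void)executor.GetDynamicBatchInfo(model_id, batch_info);

  if (executor.SetDynamicBatchSize(model_id, dynamic_input_addr, addr_len, /*batch_size=*/8) != ge::SUCCESS) {
    return ge::FAILED;
  }
  // ... fill ge::RunModelData input/output blobs and call executor.RunModel(input, output) ...
  return executor.UnloadModel(model_id);
}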
@@ -58,6 +60,8 @@ class GeGenerator { const std::vector &outputs, const std::string &model_file_name); private: + Status GenerateModel(const Graph &graph, const string &file_name_prefix, const vector &inputs, + ge::ModelBufferData &model, bool is_offline = true); class Impl; std::shared_ptr impl_; diff --git a/inc/framework/generator/generator_api.h b/inc/framework/generator/generator_api.h index 39b4da8b..71c6832e 100644 --- a/inc/framework/generator/generator_api.h +++ b/inc/framework/generator/generator_api.h @@ -24,7 +24,6 @@ extern "C" { #endif typedef uint32_t Status_t; -using Status_t = uint32_t; typedef void *OpAttr_t; typedef void *OpTensor_t; diff --git a/inc/framework/memory/memory_assigner.h b/inc/framework/memory/memory_assigner.h index bbf0939b..34c58d26 100644 --- a/inc/framework/memory/memory_assigner.h +++ b/inc/framework/memory/memory_assigner.h @@ -23,7 +23,7 @@ #include "graph/node.h" namespace ge { -const int64_t kMemAlignSize = 512; +const int64_t MEM_ALIGN_SIZE = 512; class MemoryAssigner { public: explicit MemoryAssigner(ge::ComputeGraphPtr compute_graph) : compute_graph_(std::move(compute_graph)) {} @@ -39,4 +39,4 @@ class MemoryAssigner { ge::ComputeGraphPtr compute_graph_; }; } // namespace ge -#endif // INC_FRAMEWORK_MEMORY_MEMORY_ASSIGNER_H_ \ No newline at end of file +#endif // INC_FRAMEWORK_MEMORY_MEMORY_ASSIGNER_H_ diff --git a/inc/framework/omg/omg_inner_types.h b/inc/framework/omg/omg_inner_types.h index 67834c77..d2599856 100644 --- a/inc/framework/omg/omg_inner_types.h +++ b/inc/framework/omg/omg_inner_types.h @@ -31,7 +31,6 @@ using domi::DOMI_TENSOR_ND; using domi::DOMI_TENSOR_RESERVED; using domi::domiTensorFormat_t; -using domi::FMK_TYPE_RESERVED; using domi::FrameworkType; using std::map; using std::string; @@ -44,10 +43,10 @@ namespace ge { * @brief run model */ enum RunMode { - kGeOmModel = 0, // generate offline model file - kModelToJson = 1, // convert to JSON file - kOnlyPreCheck = 3, // only for pre-check - kPbtxtToJson = 5 // pbtxt to json + GEN_OM_MODEL = 0, // generate offline model file + MODEL_TO_JSON = 1, // convert to JSON file + ONLY_PRE_CHECK = 3, // only for pre-check + PBTXT_TO_JSON = 5 // pbtxt to json }; /// @@ -56,10 +55,10 @@ enum RunMode { /// enum HighPrecisionMode { // the FP16 high-precision function is disabled in common mode - kHighPrecisonDefault = 0, + HIGH_PRECISION_DEFAULT = 0, - // high-precision mode, in which FP16 high-precision mode (Convolution/FullConnect/AvgPooling are involved) is enable - kHighPrecisionFP16 = 1 + // high-precision mode, enabling FP16 high-precision mode (Convolution/FullConnect/AvgPooling are involved) + HIGH_PRECISION_FP16 = 1 }; /// @@ -99,21 +98,23 @@ struct OmgContext { // preferential format used by the entire network domiTensorFormat_t net_format = DOMI_TENSOR_RESERVED; domi::FrameworkType type = domi::FMK_TYPE_RESERVED; - RunMode run_mode = kOnlyPreCheck; + RunMode run_mode = ONLY_PRE_CHECK; bool train_flag = false; // whether to use FP16 high precision - int32_t fp16_high_precision = kHighPrecisonDefault; + int32_t fp16_high_precision = HIGH_PRECISION_DEFAULT; std::string output_type; // Save the name of the entire network: Some special operators are used to determine a network. Some operators in the - // network require special processing based on the specific network. - // e.g:faster-rcnn, the FirstStageProcessor module is determined as the Faster-R-CNN network based on the scope - // fusion. 
Then, the conv+reshape operators in the FirstStageBoxPredictor/BoxEncodingPredictor scope are combined. The - // convolution kernel rearrangement reshape operator needs to be deleted for the convolution kernel. + // network require special processing based on the specific network. e.g:faster-rcnn, the FirstStageProcessor module + // is determined as the Faster-R-CNN network based on the scope fusion. Then, the conv+reshape operators in the + // FirstStageBoxPredictor/BoxEncodingPredictor scope are combined. The convolution kernel rearrangement reshape + // operator needs to be deleted for the convolution kernel. std::string net_name; - // whether to enable dynamic batch - bool enable_l2dynamic = false; + // Whether to use dynamic batch size or dynamic image size + bool is_dynamic_input = false; + std::string dynamic_batch_size; + std::string dynamic_image_size; }; } // namespace ge diff --git a/inc/framework/omg/version.h b/inc/framework/omg/version.h index 993f0a8f..ac649d83 100644 --- a/inc/framework/omg/version.h +++ b/inc/framework/omg/version.h @@ -32,15 +32,7 @@ class PlatformVersionManager { PlatformVersionManager() = delete; ~PlatformVersionManager() = delete; static Status GetPlatformVersion(std::string &ver) { -#if defined PLATFORM_PHOENIX - ver = "3.51.z"; -#elif defined PLATFORM_ORLANDO - ver = "3.31.z"; -#elif defined PLATFORM_MINI ver = "1.11.z"; -#elif defined PLATFORM_CLOUD - ver = "1.61.z"; -#endif std::vector version_splits = StringUtils::Split(ver, '.'); GE_IF_BOOL_EXEC(version_splits.size() < 3, GELOGW("Read platform version error!"); return FAILED;); diff --git a/inc/graph/anchor.h b/inc/graph/anchor.h index 5321fe47..565f0843 100644 --- a/inc/graph/anchor.h +++ b/inc/graph/anchor.h @@ -20,13 +20,17 @@ #include #include #include - #include "graph/ge_error_codes.h" #include "graph/range_vistor.h" #include "graph/types.h" namespace ge { -enum AnchorStatus { ANCHOR_SUSPEND = 0, ANCHOR_CONST = 1, ANCHOR_DATA = 2, ANCHOR_RESERVED = 3 }; +enum AnchorStatus { + ANCHOR_SUSPEND = 0, // data null + ANCHOR_CONST = 1, + ANCHOR_DATA = 2, // Effective + ANCHOR_RESERVED = 3 +}; using std::string; using std::vector; @@ -81,17 +85,19 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Anchor : public std::enable virtual ~Anchor() = default; protected: - // Whether the two anchors are equal + // Whether two anchors are equal virtual bool Equal(AnchorPtr anchor) const = 0; virtual bool IsTypeOf(TYPE type) const; public: // Get all peer anchors connected to current anchor Vistor GetPeerAnchors() const; - // Get the first peer anchor + // Get peer anchor size + size_t GetPeerAnchorsSize() const; + // Get first peer anchor AnchorPtr GetFirstPeerAnchor() const; - // Get the node which is the owner of the anchor + // Get the node that owns the anchor NodePtr GetOwnerNode() const; // Remove all links with the anchor @@ -100,22 +106,22 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Anchor : public std::enable // Remove link with the given anchor graphStatus Unlink(const AnchorPtr &peer); - // Replace the peeranchor with the new peeranchor + // Replace peer with new peers graphStatus ReplacePeer(const AnchorPtr &oldPeer, const AnchorPtr &firstPeer, const AnchorPtr &secondPeer); // Judge if the anchor is linked with the given anchor bool IsLinkedWith(const AnchorPtr &peer); - // Get the anchor index of the node + // Get anchor index of the node int GetIdx() const; - // Set the anchor index of the node + // Set anchor index of the node void SetIdx(int index); protected: // All peer
anchors connected to current anchor vector> peer_anchors_; - // The owner nodes of the anchor + // The owner node of anchor std::weak_ptr owner_node_; // The index of current anchor int idx_; @@ -167,7 +173,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY InDataAnchor : public DataA virtual ~InDataAnchor() = default; - // Get source out data anchor + // Get source out data anchor OutDataAnchorPtr GetPeerOutAnchor() const; // Build connection from OutDataAnchor to InDataAnchor diff --git a/inc/graph/attr_value_serializable.h b/inc/graph/attr_value_serializable.h index 7570c73d..2b2a7733 100644 --- a/inc/graph/attr_value_serializable.h +++ b/inc/graph/attr_value_serializable.h @@ -19,10 +19,10 @@ #include #include - #include "graph/ge_attr_value.h" namespace ge { + class GeAttrValue; class _GeSerializable { public: @@ -107,7 +107,6 @@ class _GeSerializable { static graphStatus LoadItem(GeAttrValue::NamedAttrs &namedAttrs __attribute__((__unused__))) { return GRAPH_SUCCESS; } }; - #define _GE_FI(a) #a, a #define _GE_MAP_FIELDS1(a1) _GE_FI(a1) #define _GE_MAP_FIELDS2(a1, a2) _GE_FI(a1), _GE_FI(a2) @@ -130,23 +129,23 @@ class _GeSerializable { #define _GE_MAP_FIELDS11(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ _GE_FI(a1) \ , _GE_FI(a2), _GE_FI(a3), _GE_FI(a4), _GE_FI(a5), _GE_FI(a6), _GE_FI(a7), _GE_FI(a8), _GE_FI(a9), _GE_FI(a10), \ - _GE_FI(a11) + _GE_FI(a11) #define _GE_MAP_FIELDS12(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ _GE_FI(a1) \ , _GE_FI(a2), _GE_FI(a3), _GE_FI(a4), _GE_FI(a5), _GE_FI(a6), _GE_FI(a7), _GE_FI(a8), _GE_FI(a9), _GE_FI(a10), \ - _GE_FI(a11), _GE_FI(a12) + _GE_FI(a11), _GE_FI(a12) #define _GE_MAP_FIELDS13(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ _GE_FI(a1) \ , _GE_FI(a2), _GE_FI(a3), _GE_FI(a4), _GE_FI(a5), _GE_FI(a6), _GE_FI(a7), _GE_FI(a8), _GE_FI(a9), _GE_FI(a10), \ - _GE_FI(a11), _GE_FI(a12), _GE_FI(a13) + _GE_FI(a11), _GE_FI(a12), _GE_FI(a13) #define _GE_MAP_FIELDS14(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14) \ _GE_FI(a1) \ , _GE_FI(a2), _GE_FI(a3), _GE_FI(a4), _GE_FI(a5), _GE_FI(a6), _GE_FI(a7), _GE_FI(a8), _GE_FI(a9), _GE_FI(a10), \ - _GE_FI(a11), _GE_FI(a12), _GE_FI(a13), _GE_FI(a14) + _GE_FI(a11), _GE_FI(a12), _GE_FI(a13), _GE_FI(a14) #define _GE_MAP_FIELDS15(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15) \ _GE_FI(a1) \ , _GE_FI(a2), _GE_FI(a3), _GE_FI(a4), _GE_FI(a5), _GE_FI(a6), _GE_FI(a7), _GE_FI(a8), _GE_FI(a9), _GE_FI(a10), \ - _GE_FI(a11), _GE_FI(a12), _GE_FI(a13), _GE_FI(a14), _GE_FI(a15) + _GE_FI(a11), _GE_FI(a12), _GE_FI(a13), _GE_FI(a14), _GE_FI(a15) #define _GE_PRIVATE_ARGS_GLUE(x, y) x y diff --git a/inc/graph/buffer.h b/inc/graph/buffer.h index d781fe0b..e6be3daa 100644 --- a/inc/graph/buffer.h +++ b/inc/graph/buffer.h @@ -17,12 +17,11 @@ #ifndef INC_GRAPH_BUFFER_H_ #define INC_GRAPH_BUFFER_H_ +#include #include #include #include - #include "detail/attributes_holder.h" -#include "graph/types.h" namespace ge { #ifdef HOST_VISIBILITY @@ -72,7 +71,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Buffer { GeIrProtoHelper data_; std::string *buffer_ = nullptr; - // Create buffer from protobuf obj + // Create from protobuf obj Buffer(const ProtoMsgOwner &protoOnwer, proto::AttrDef *buffer); Buffer(const ProtoMsgOwner &protoOnwer, std::string *buffer); diff --git a/inc/graph/compute_graph.h b/inc/graph/compute_graph.h index 5cf02dd2..c63494f8 100644 --- a/inc/graph/compute_graph.h +++ b/inc/graph/compute_graph.h @@ -17,7 +17,6 @@ #ifndef INC_GRAPH_COMPUTE_GRAPH_H_ 
#define INC_GRAPH_COMPUTE_GRAPH_H_ -#include #include #include #include @@ -63,7 +62,7 @@ class ComputeGraph : public std::enable_shared_from_this, public A using Vistor = RangeVistor>; explicit ComputeGraph(const std::string &name); - virtual ~ComputeGraph(); + ~ComputeGraph() override; std::string GetName() const; void SetName(const std::string &name); @@ -81,7 +80,7 @@ class ComputeGraph : public std::enable_shared_from_this, public A Vistor GetOutputNodes() const; NodePtr FindNode(const std::string &name) const; - // Add node + // AddNode with NodePtr NodePtr AddNode(NodePtr node); NodePtr AddNode(OpDescPtr op); NodePtr AddNodeFront(NodePtr node); @@ -94,9 +93,40 @@ class ComputeGraph : public std::enable_shared_from_this, public A graphStatus RemoveOutputNode(const NodePtr &node); graphStatus RemoveConstInput(const NodePtr &node); + /// Add a subgraph to this graph. The subgraph must have a parent graph and parent node, + /// which means the member functions `SetParentGraph` and `SetParentNode` of the subgraph + /// must be called before adding it to the root graph, and subgraph->GetParentNode()->GetOwnerGraph() + /// must be equal to subgraph->GetOwnerGraph(). + /// Subgraphs can only be added to a *root graph*. A root graph is a graph without any parent graph. + /// The subgraph's name SHOULD (not MUST) be the same as the parameter `name` + graphStatus AddSubgraph(const std::string &name, const std::shared_ptr &subgraph); + graphStatus AddSubgraph(const std::shared_ptr &subgraph); + + void RemoveSubgraph(const std::string &name); + void RemoveSubgraph(const std::shared_ptr &subgraph); + + std::shared_ptr GetSubgraph(const std::string &name) const; + std::vector> GetAllSubgraphs() const; + + // obsolete std::shared_ptr AddSubGraph(std::shared_ptr sub_graph); + // obsolete graphStatus RemoveSubGraph(const std::shared_ptr &sub_graph); + /// + /// @brief Update input-mapping + /// @param [in] input_mapping : index_of_cur_graph_node_input -> index_of_new_graph_node_input + /// @return graphStatus + /// + graphStatus UpdateInputMapping(const std::map &input_mapping); + + /// + /// @brief Update output-mapping + /// @param [in] output_mapping : index_of_cur_graph_node_output -> index_of_new_graph_node_output + /// @return graphStatus + /// + graphStatus UpdateOutputMapping(const std::map &output_mapping); + graphStatus TopologicalSorting(); bool IsValid() const; void Dump() const; @@ -127,6 +157,11 @@ class ComputeGraph : public std::enable_shared_from_this, public A } } + shared_ptr GetParentGraph(); + void SetParentGraph(const shared_ptr &parent); + shared_ptr GetParentNode(); + void SetParentNode(const shared_ptr &parent); + const std::map> &GetGraphOutNodes() const { return out_nodes_map_; } void SetOrigGraph(ComputeGraphPtr orig_graph) { origGraph_ = orig_graph; } @@ -138,8 +173,8 @@ class ComputeGraph : public std::enable_shared_from_this, public A uint32_t GetInputSize() const { return input_size_; } /// - /// Set iteration needed. - /// If set is true, it means this graph need run iteration some + /// Set whether training iteration is needed. + /// If set to true, this graph needs to run iteration some /// times(according variant "npu_runconfig/iterations_per_loop"). /// @param need_iteration is need iteration /// @@ -150,7 +185,7 @@ class ComputeGraph : public std::enable_shared_from_this, public A const std::string GetOutput(); /// - /// Get is need train iteration.
/// @return is need iteration /// bool GetNeedIteration() const { return need_iteration_; } @@ -201,6 +236,7 @@ class ComputeGraph : public std::enable_shared_from_this, public A std::deque &stack); graphStatus CollectBreadthOutNode(const NodePtr &node, std::map &map_in_edge_num, std::map &breadth_node_map); + graphStatus TopologicalSortingSubgraph(); graphStatus SortNodes(std::vector &stack, std::map &mapInEdgeNum); size_t GetInEdgeSize(const NodePtr &node); size_t GetOutEdgeSize(const NodePtr &node); @@ -210,31 +246,38 @@ class ComputeGraph : public std::enable_shared_from_this, public A bool VectorInputNodePtrIsEqual(const std::vector &r_node_ptr_vector, const std::vector &l_node_ptr_vector) const; - ProtoAttrMapHelper attrs_; - friend class ModelSerializeImp; friend class GraphDebugImp; friend class OnnxUtils; + + std::string name_; + uint32_t graph_id_ = 0; + ProtoAttrMapHelper attrs_; std::vector nodes_; + std::map all_nodes_infos_; + std::vector target_nodes_info_; + std::vector input_nodes_; + std::vector inputs_order_; + uint32_t input_size_ = 1; + std::map> out_nodes_map_; + uint32_t output_size_ = 1; + std::vector> output_nodes_info_; + std::vector> sub_graph_; - std::string name_; + std::map> names_to_subgraph_; + std::weak_ptr parent_graph_; + std::weak_ptr parent_node_; + + // the members followed should not in the ComputeGraph class bool is_valid_flag_; bool is_summary_graph_ = false; // Indicates whether it is need iteration bool need_iteration_ = false; std::map, std::vector> params_share_map_; - std::map> out_nodes_map_; // TaskIdx -> op_name Map std::map op_name_map_; - std::vector inputs_order_; - uint32_t output_size_ = 1; - uint32_t input_size_ = 1; - std::map all_nodes_infos_; - std::vector> output_nodes_info_; - std::vector target_nodes_info_; uint64_t session_id_ = 0; - uint32_t graph_id_ = 0; ge::Format data_format_ = ge::FORMAT_ND; }; } // namespace ge diff --git a/inc/graph/debug/ge_attr_define.h b/inc/graph/debug/ge_attr_define.h index 6476d07f..00f5edbd 100644 --- a/inc/graph/debug/ge_attr_define.h +++ b/inc/graph/debug/ge_attr_define.h @@ -18,7 +18,6 @@ #define INC_GRAPH_DEBUG_GE_ATTR_DEFINE_H_ #include - #include "graph/types.h" namespace ge { @@ -59,6 +58,8 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_BIAS_TERM; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_HAS_BIAS_VALUE; + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PAD; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PADS; @@ -75,8 +76,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_CEIL_MODE; -// GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string -// ATTR_NAME_WEIGHTS; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STRIDE_SIZE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_RELUMODE; @@ -124,6 +124,13 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NAN_OPT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_AIPP; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string NEW_AIPP_CONV_OP; + 
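The preconditions in the AddSubgraph comment in compute_graph.h above amount to a fixed wiring order: set the parent pointers first, then register the subgraph on the root graph. A minimal sketch, assuming the usual Node::GetOwnerComputeGraph accessor; the helper name is ours:

#include "graph/compute_graph.h"
#include "graph/node.h"

ge::graphStatus AttachSubgraph(const ge::ComputeGraphPtr &root, const ge::NodePtr &parent_node,
                               const ge::ComputeGraphPtr &subgraph) {
  // Parent graph and parent node must be wired before AddSubgraph is called.
  subgraph->SetParentGraph(parent_node->GetOwnerComputeGraph());
  subgraph->SetParentNode(parent_node);
  // `root` must be a root graph (no parent graph); the name SHOULD match the subgraph's own name.
  return root->AddSubgraph(subgraph->GetName(), subgraph);
}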
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SESSION_GRAPH_ID; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_MULTISHAPE_BATCHLIST; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_MULTISHAPE_BATCHLIST_SIZE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_BATCH_NUM; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_INPUT_FORMAT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OUTPUT_FORMAT; @@ -141,10 +148,15 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PRED_PERMUTE_DELETED; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IGNORE_PRED_FORMAT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WEIGHTS; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WEIGHTS_DATA; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_BROACAST_REAL_DIM_CNT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DIM_ALIGN; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STREAM_CYCLE_EVENT_FLAG; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_RTSWITCH_RECV_EVENT_ID; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_AUTOMIC_ADD_START; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_AUTOMIC_ADD_MEM_SIZE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WEIGHTS_DATA; + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SESSION_GRAPH_ID; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_BATCH_NUM; @@ -166,15 +178,15 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_MBOX GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_MBOX_FUSION_BOX_TYPE_NUM; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_RESHAPE_SLICE_CONCAT_FUSION; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_PRIORBOX_CONCAT; - GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REFINEDET_MBOX_LOC_FUSION; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REFINEDET_RESHAPE_SLICE_CONCAT_FUSION; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REFINEDET_MBOX_CONF_FUSION; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REFINEDET_MBOX_FUSION_BOX_TYPE_NUM; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REFINEDET_RESHAPE_SLICE_CONCAT_FUSION; + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REFINEDET_PRIOR_BOX_ATTR_VARIANCE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REFINEDET_PRIOR_BOX_ATTR_VARIANCE_NUM; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_PRIORBOX_CONCAT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string NEED_INFER; // _Arg @@ -263,7 +275,29 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string BATCHNOR GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string 
BATCHNORM_ATTR_ESTIMATED_VARIANCE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string BATCHNORM_ATTR_SCALE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string BATCHNORM_ATTR_BIAS; - +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string BATCHNORM_ATTR_DATA_FORMAT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string BATCHNORM_ATTR_IS_TRAINING; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string BATCHNORM_ATTR_IS_TRAINING_FUSION; + +// Huberloss +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HUBER_LOSS_ATTR_DELTA; + +// SSDRealDivTileMul +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_REAL_DIV_TILE_MUL_ATTR_TILE_PARA; + +// SSDSumMulRealDivMean +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_SUM_MUL_REALDIV_MEAN_ATTR_REDUCTION_INDICES; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_SUM_MUL_REALDIV_MEAN_ATTR_AXIS; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_SUM_MUL_REALDIV_MEAN_ATTR_MEAN_PARA; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_SUM_MUL_REALDIV_MEAN_ATTR_HAS_SUM; +/// ConcatFive2Four +/// ConcatFour2Five +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_CLASS_NUM; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_FEATURE_MAP_SIZE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string TRANS_FOR_LOSS_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_BOX_TYPE_NUM; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_FEATURE_MAP_HIGH; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_FEATURE_MAP_WIDTH; // Scale GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SCALE_ATTR_SCALE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SCALE_ATTR_BIAS; @@ -300,7 +334,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string CONST_AT GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string CONST_ATTR_NAME_OUTPUT_FORMAT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string CONST_ATTR_NAME_OUTPUT_TYPE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEPTH_SPACE_ATTR_BLOCK_SIZE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string L2_NORMALIZE_ATTR_EPS; // Roipooling @@ -313,6 +346,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ROIPOOLI // DetectionOutput GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DETECTIONOUTPUT_ATTR_NUM_CLASSES; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DETECTIONOUTPUT_ATTR_OCR_NUM_CLASSES; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DETECTIONOUTPUT_ATTR_NMS_THRESHOLD; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DETECTIONOUTPUT_ATTR_TOP_K; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DETECTIONOUTPUT_ATTR_CONFIDENCE_THRESHOLD; @@ -371,6 +405,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SOFTMAX_ // Permute GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string PERMUTE_ATTR_ORDER; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string PERMUTE_ATTR_PERM; // SSD Normalize 
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSDNORMALIZE_ATTR_ACCROSS_SPATIAL; @@ -411,9 +446,15 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string POWER_AT GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string POWER_ATTR_NAME_SCALE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string POWER_ATTR_NAME_SHIFT; +// Log +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LOG_ATTR_NAME_SCALE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LOG_ATTR_NAME_SHIFT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LOG_ATTR_NAME_BASE; // Pack GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string PACK_ATTR_NAME_NUM; +// Dynamic stitch +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DYNAMIC_STITCH_ATTR_NAME_NUM; // Unpack GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string UNPACK_ATTR_NAME_NUM; // Gathernd @@ -422,8 +463,16 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string GATHERND // Argmax GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ARGMAX_ATTR_NAME_TOPK; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ARGMAX_ATTR_NAME_REDUCESIZE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ARGMAX_ATTR_NAME_REDUCESTRIDE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ARGMAX_ATTR_NAME_OUTMAX; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ARGMAX_ATTR_NAME_AXIS; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ARGMAX_ATTR_NAME_AXISTYPE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ARGMAX_ATTR_NAME_KEEPDIMS; +// Upsample +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string UPSAMPLE_ATTR_NAME_SCALE_H; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string UPSAMPLE_ATTR_NAME_SCALE_W; // Relu GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NEGATIVE_SLOPE; @@ -439,6 +488,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SPLIT_AT GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string TVM_ATTR_NAME_MAGIC; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string TVM_ATTR_NAME_BLOCKDIM; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string TVM_ATTR_NAME_METADATA; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string TVM_ATTR_NAME_WORKSPACE_TYPE; // Squeeze GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SQUEEZE_ATTR_AXIS; @@ -461,6 +511,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ROIALIGN GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ROIALIGN_ATTR_SAMPLING_RATIO; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ROIALIGN_ATTR_NAME_POOLED_H; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ROIALIGN_ATTR_NAME_POOLED_W; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ROIALIGN_ATTR_NAME_TF; // Generate_rpn_proposal GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string GENERATE_RPN_PROPOSAL_ATTR_PRE_NMS_TOPK; @@ -493,6 +544,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REORG_AT GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const 
std::string MERGE_DEAD_INDEX; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string MERGE_PRENODE_FLAG; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string TO_BE_OUTPUT; +static const std::string NOT_NET_OUTPUT = "not_net_output"; // ENTER GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ENTER_ATTR_FRAME_NAME; @@ -518,6 +570,9 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RESIZE_B GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RESIZE_BILINEAR_ATTR_ALPHA; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RESIZE_BILINEAR_ATTR_BETA; +// RetinaNet +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RETINANET_FILTER_BACKGROUND_TRUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RETINANET_ANCHOR_FUSION; // MatMul GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string MATMUL_TRANSPOSE_X; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string MATMUL_TRANSPOSE_W; @@ -566,10 +621,30 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string GRU_CELL GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RNN_HT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RNN_XT_HT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RNN_BATCH_SIZE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LSTM_CELL_CLIP; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LSTM_PROJ_CLIP; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LSTM_ACTIVATE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LSTM_OUT_MAP; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LSTM_OUT_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LSTM_STATE_OUT_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LSTM_TIME_MAJOR; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LSTM_IS_INPUT_PRE_PROCESS; // Upsample GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string UPSAMPLE_ATTR_NAME_SCALE; +// PadV2 +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string PADV2_ATTR_NAME_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string PADV2_ATTR_NAME_PADS; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string PADV2_ATTR_NAME_T; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string PADV2_ATTR_NAME_PAD_FORMAT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string PADV2_ATTR_NAME_CONST_VALUE; + +// MirrorPad +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string MIRRORPAD_ATTR_NAME_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string MIRRORPAD_ATTR_NAME_PADS; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string MIRRORPAD_ATTR_NAME_PAD_FORMAT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string MIRRORPAD_ATTR_NAME_CONST_VALUE; // Filler GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string FILLER_TYPE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string FILLER_VALUE; @@ -590,36 +665,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string PAD_LEFT GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string 
QUANTIZE_ALGO_ATTR; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SCALE_TYPE_ATTR; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_SCALE_MODE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_SCALE_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_SCALE_OFFSET; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_DATA_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_DATA_OFFSET; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_WEIGHT_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_WEIGHT_OFFSET; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_PAD_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_PAD_OFFSET; - -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_SCALE_MODE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_SCALE_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_SCALE_OFFSET; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_DATA_TYPE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_DATA_OFFSET; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_WEIGHT_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_WEIGHT_OFFSET; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_PAD_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_PAD_OFFSET; - -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_SCALE_MODE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_SCALE_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_SCALE_OFFSET; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_DATA_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_DATA_OFFSET; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_WEIGHT_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_WEIGHT_OFFSET; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_PAD_VALUE; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_PAD_OFFSET; - GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IS_CONST; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_GROUP; @@ -634,6 +679,8 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MOD GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_EVENT_NUM; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_LABEL_NUM; + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_MEMORY_SIZE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_WEIGHT_SIZE; @@ -642,12 +689,6 @@ GE_FUNC_DEV_VISIBILITY 
GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MOD GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_TASK_GEN_WEIGHT_ADDR; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_TASK_GEN_VAR_ADDR; - -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_VAR_SIZE; - -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_TASK_INDEX_OP_NAME; - // Public attribute GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IMPLY_TYPE; @@ -685,11 +726,178 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_REFERENCE; +// Used for operators that do not generate tasks +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NOTASK; + +// Used for operators whose outputs reuse their inputs +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OUTPUT_REUSE_INPUT; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NOPADDING_CONTINUOUS_INPUT; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT; + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_ATOMIC_INDEX; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_TASK_GEN_VAR_ADDR; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STREAM_LABEL; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_VAR_SIZE; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_TASK_INDEX_OP_NAME; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_CORE_TYPE; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_SCALE_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_SCALE_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_SCALE_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_DATA_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_DATA_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_WEIGHT_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_WEIGHT_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_PAD_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string QUANTIZE_OFFSET_PAD_OFFSET; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_SCALE_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_SCALE_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_SCALE_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_DATA_TYPE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_DATA_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_WEIGHT_VALUE; +GE_FUNC_DEV_VISIBILITY
GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_WEIGHT_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_PAD_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEQUANTIZE_OFFSET_PAD_OFFSET; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_SCALE_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_SCALE_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_SCALE_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_DATA_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_DATA_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_WEIGHT_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_WEIGHT_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_PAD_VALUE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REQUANTIZE_OFFSET_PAD_OFFSET; + +// L2_normalize +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string L2_NORMALIZE_ATTR_AXIS; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string L2_NORMALIZE_ATTR_EPS; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string POOL_PARAMA_ATTR_WINDOW; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string POOL_PARAMA_ATTR_CEIL_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string POOL_PARAMA_ATTR_DATA_MODE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string POOL_PARAMA_ATTR_GLOBAL_POOLING; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string POOL_PARAMA_ATTR_NAN_OP; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string POOL_PARAMA_ATTR_PAD_MOD; +// HCOM +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_ROOT_RANK; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_REDUCE_TYPE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_RANK_SIZE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_REDUCTION; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_GROUP; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_SR_TAG; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_SRC_RANK; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_DEST_RANK; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_FUSION; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_SHAPE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HCOM_ATTR_DATA_TYPE; + +// Log time stamp +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LOG_TIME_STAMP_LOGID; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LOG_TIME_STAMP_NOTIFY; +// SpaceToDepth/DepthToSpace +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_BLOCK_SIZE; + +// SparseSoftmaxCrossEntropyWithLogits +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SPARSE_SOFT_MAX_ATTR_TLABLES; + +// 
MaxPoolGradWithArgmax +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string MAX_POOL_GRAD_OUTPUT_SHAPE; + +// AvgPoolGrad +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string AVG_POOL_GRAD_OUTPUT_SHAPE; + +// Variable +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_FORMAT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_NAME; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_FRACTALZ_FORMAT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_4D_FORMAT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_5D_FORMAT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_DATA_TYPE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_VAR_IN_NAME; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_VAR_IN_INDEX; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_VAR_OUT_INDEX; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_SHAPE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string HALF_VAR_NAME_END; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_CONTAINER; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_SHARED_NAME; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_DTYPE; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_ADDR_OFFSET; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_VAR_IN_INDEX_KEY; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_VAR_OUT_INDEX_KEY; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_SRC_VAR_NAME; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_VAR_IS_SAVE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_VAR_IS_RESTORE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_VAR_IS_BROADCAST; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REF_VAR_SRC_VAR_NAME; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REF_VAR_PRE_PEER_OUT_INDEX; + +// Assign +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ASSIGN_VALIDATE_SHAPE; + +// ShapeN +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SHAPEN_ATTR_N; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SHAPEN_ATTR_IN_TYPE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SHAPEN_ATTR_OUT_TYPE; + +// Space2batch batch2space +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string BATCH_SPACE_ATTR_BLOCK; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string BATCH_SPACE_ATTR_PADDING; +// Depth_to_space space_to_depth +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DEPTH_SPACE_ATTR_BLOCK_SIZE; +// FakeQuantWithMinMaxVars +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string FakeQuantWithMinMaxVars_ATTR_MAX; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string FakeQuantWithMinMaxVars_ATTR_MIN; +// Mobilenet_ssd_conv_fusion +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string
SSD_BOXPREDICTOR_BOXES_FUSION; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_BOXPREDICTOR_SCORES_FUSION; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string SSD_BOXPREDICTOR_FUSION_BOX_TYPE_NUM; + +// Lsh project +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string LSH_PROJ_TYPE; + +// Control flow +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_ITERATORS_PER_LOOP; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_TRUE_BRANCH_STREAM; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FLOW_CTRL_NODE_FLAG; + +// GatherV2 attr def +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string GATHERV2_ATTR_NAME_TAXIS; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string GATHERV2_ATTR_NAME_TINDICES; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string GATHERV2_ATTR_NAME_TPARAMS; + +// Reshape attr def +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RESHAPE_ATTR_NAME_INPUT_DESC; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string RESHAPE_ATTR_NAME_OUTPUT_DESC; + +// Axis attr def +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_AXIS_ORG_OP; +// The node link with SparseSoftmaxCrossEntropyWithLogits +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LINK_WITH_SPARE; + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NET_OUTPUT_FORMAT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NET_OUTPUT_DATATYPE; +// For constant folding +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NO_NEED_CONSTANT_FOLDING; + // Used for mark the active label list to find stream of activated node GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_ACTIVE_LABEL_LIST; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE; + // Multi batch GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PRED_VALUE; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_BATCH_NUM; @@ -697,7 +905,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM // Control flow GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STREAM_SWITCH_COND; -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_TRUE_BRANCH_STREAM; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_ACTIVE_STREAM_LIST; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SWITCHN_PRED_VALUE; @@ -709,6 +916,9 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_NEXT_ITERATION; +// Function Op +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PARENT_NODE_INDEX; + // Used for mark the active node is for loop, type:bool GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IS_LOOP_ACTIVE; @@ -742,6 +952,9 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_INS // For inserted op GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_INSERTED_BY_GE; 
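ATTR_NAME_PARENT_NODE_INDEX works together with the subgraph contract added to compute_graph.h earlier in this patch: each Data node inside a subgraph records which input of the parent node feeds it. A sketch of the wiring (illustrative only: `if_node` is assumed to exist in the root graph already, Node::GetOwnerComputeGraph() is assumed as the owner-graph accessor, and error handling is elided):

#include <memory>
#include "graph/compute_graph.h"
#include "graph/op_desc.h"
#include "graph/utils/attr_utils.h"

ge::graphStatus AttachThenBranch(const ge::ComputeGraphPtr &root_graph, const ge::NodePtr &if_node) {
  auto branch = std::make_shared<ge::ComputeGraph>("then_branch");
  ge::OpDescPtr data_desc = std::make_shared<ge::OpDesc>("data_0", "Data");
  // Record that this Data node is fed by input 0 of the parent If node.
  (void)ge::AttrUtils::SetInt(data_desc, ge::ATTR_NAME_PARENT_NODE_INDEX, 0);
  (void)branch->AddNode(data_desc);
  // Per the AddSubgraph contract, parent graph and parent node must be set
  // before the subgraph is added to the root graph.
  branch->SetParentGraph(if_node->GetOwnerComputeGraph());
  branch->SetParentNode(if_node);
  return root_graph->AddSubgraph("then_branch", branch);
}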
+// For compress weight +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_COMPRESS_WEIGHT; + // For data dump GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DATA_DUMP_IS_MULTIOP; @@ -752,6 +965,23 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DATA_DUMP_ORIGIN_FORMAT; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DATA_DUMP_ORIGIN_DATA_TYPE; +// used for l1 fusion and other fusion in future +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_L1_FUSION_GROUP_ID; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_L1_FUSION_GROUP_KEY; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FUSION_VIRTUAL_OP; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FUSION_GROUP_TYPE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_INPUT_MEM_TYPE_LIST; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OUTPUT_MEM_TYPE_LIST; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_L1_FUSION_EXTEND_PTR; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_GET_TENSOR_ACTUAL_SIZE; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_OUTPUT_OFFSET_FOR_L1_FUSION; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SWITCH_FOR_L1_FUSION; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_N_BATCH_SPILT; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NO_TASK_AND_DUMP_NEEDED; + +// used for label switch +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LABEL_SWITCH_INDEX; +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LABEL_SWITCH_LIST; // Varible GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REF_VAR_SRC_VAR_NAME; GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_SRC_VAR_NAME; diff --git a/inc/graph/def_types.h b/inc/graph/def_types.h index 706e7f51..6d70fb18 100644 --- a/inc/graph/def_types.h +++ b/inc/graph/def_types.h @@ -20,10 +20,8 @@ #include #include #include - #include "graph/attr_value_serializable.h" #include "graph/buffer.h" - namespace ge { #define DEF_TYPE_DEC(type, name) \ inline void set_##name(const type &value) { name = value; } \ @@ -49,10 +47,9 @@ namespace ge { inline void add_##name(type value) { name.push_back(value); } \ inline std::vector *mutable_##name() { return &name; } -#define DEF_TYPE_BYTES_DEC(name) \ - inline void clear_##name() { name.ClearBuffer(); } \ - inline void set_##name(const void *value, size_t size) { \ - name = Buffer::CopyFrom((const uint8_t *)(value), size); } \ +#define DEF_TYPE_BYTES_DEC(name) \ + inline void clear_##name() { name.ClearBuffer(); } \ + inline void set_##name(const void *value, size_t size) { name = Buffer::CopyFrom((const uint8_t *)(value), size); } \ inline Buffer *mutable_##name() { return &name; } struct CompressInfo { diff --git a/inc/graph/detail/attributes_holder.h b/inc/graph/detail/attributes_holder.h index 77903b30..bb26dec5 100644 
--- a/inc/graph/detail/attributes_holder.h +++ b/inc/graph/detail/attributes_holder.h @@ -23,7 +23,6 @@ #include #include #include - #include "graph/detail/any_map.h" #include "graph/ge_error_codes.h" #include "graph/types.h" @@ -96,7 +95,7 @@ class GeIrProtoHelper { } } - // protoMsg_ is part of protoOwner_ and they have the same runtime + // protoMsg_ is part of protoOwner_, they have the same runtime ProtoMsgOwner protoOwner_ = nullptr; ProtoType *protoMsg_ = nullptr; friend class GeIrProtoHelper #include #include - #include "graph/anchor.h" -#include "graph/model.h" #include "detail/attributes_holder.h" #include "graph/ge_tensor.h" #include "graph/graph.h" @@ -48,15 +46,15 @@ struct NodeNameNodeReq { class ModelSerializeImp { public: - bool SerializeModel(const Model &model, proto::ModelDef *modeProto); + bool SerializeModel(const Model &model, proto::ModelDef *modeProto, bool is_dump = false); - bool SerializeGraph(const ConstComputeGraphPtr &graph, proto::GraphDef *graphProto); + bool SerializeGraph(const ConstComputeGraphPtr &graph, proto::GraphDef *graphProto, bool is_dump = false); bool SerializeEdge(const NodePtr &node, proto::OpDef *opDefProto); - bool SerializeOpDesc(const ConstOpDescPtr &node, proto::OpDef *opDefProto); + bool SerializeOpDesc(const ConstOpDescPtr &node, proto::OpDef *opDefProto, bool is_dump = false); - bool SerializeNode(const NodePtr &node, proto::OpDef *opDefProto); + bool SerializeNode(const NodePtr &node, proto::OpDef *opDefProto, bool is_dump = false); bool SerializeTensor(const ConstGeTensorPtr &tensor, proto::TensorDef *tensorProto); diff --git a/inc/graph/ge_attr_value.h b/inc/graph/ge_attr_value.h index 11da6fae..c5186fd1 100644 --- a/inc/graph/ge_attr_value.h +++ b/inc/graph/ge_attr_value.h @@ -23,7 +23,6 @@ #include #include #include - #include "graph/buffer.h" #include "detail/attributes_holder.h" #include "graph/ge_error_codes.h" @@ -139,15 +138,14 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeAttrValue { template // To cols - using enable_if_vector_type_valid_t = typename std::enable_if::LIST_VALUE, - int>::type; + using enable_if_vector_type_valid_t = typename std::enable_if::LIST_VALUE, int>::type; template using enable_if_one_type_valid_t = typename std::enable_if::VALUE, int>::type; template using enable_if_type_valid_t = - typename std::enable_if::VALUE || IsAttrTypeEnable::LIST_VALUE, int>::type; + typename std::enable_if::VALUE || IsAttrTypeEnable::LIST_VALUE, int>::type; template using enable_if_seriliable_type_valid_t = typename seriliable_type::__ge_serializable; diff --git a/inc/graph/ge_context.h b/inc/graph/ge_context.h index f35e09ec..b1ccd5b9 100644 --- a/inc/graph/ge_context.h +++ b/inc/graph/ge_context.h @@ -18,7 +18,6 @@ #define INC_GRAPH_GE_CONTEXT_H_ #include - #include "graph/ge_error_codes.h" namespace ge { @@ -42,4 +41,4 @@ class GEContext { GEContext &GetContext(); } // namespace ge -#endif // INC_GRAPH_GE_CONTEXT_H_ +#endif // INC_GRAPH_GE_CONTEXT_H_ diff --git a/inc/graph/ge_local_context.h b/inc/graph/ge_local_context.h index b87c10b7..b47098fb 100644 --- a/inc/graph/ge_local_context.h +++ b/inc/graph/ge_local_context.h @@ -20,7 +20,6 @@ #include #include #include - #include "graph/ge_error_codes.h" using std::map; @@ -42,5 +41,4 @@ class GEThreadLocalContext { GEThreadLocalContext &GetThreadLocalContext(); } // namespace ge - #endif // INC_GRAPH_GE_LOCAL_CONTEXT_H_ diff --git a/inc/graph/ge_tensor.h b/inc/graph/ge_tensor.h index 78534438..7a3eed68 100644 --- a/inc/graph/ge_tensor.h +++ 
b/inc/graph/ge_tensor.h @@ -21,12 +21,10 @@ #include #include #include - #include "detail/attributes_holder.h" #include "graph/buffer.h" #include "graph/ge_error_codes.h" #include "graph/types.h" - namespace ge { class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeShape { public: @@ -43,6 +41,18 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeShape { int64_t GetShapeSize() const; std::string ToString() const; + /// + /// @brief Check whether the shape is unknown + /// @return bool + /// + bool IsUnknownShape() const; + + /// + /// @brief Check whether the shape is a scalar + /// @return bool + /// + bool IsScalar() const; + GeShape(const GeShape &other); GeShape(GeShape &&other); GeShape &operator=(const GeShape &other); @@ -51,7 +61,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeShape { private: GeIrProtoHelper shape_def_; friend class GeTensorDesc; - // Create geshape from proto obj + // Create from proto obj GeShape(const ProtoMsgOwner &protoOnwer, proto::ShapeDef *protoMsg); void RefTo(const GeShape &shape) { shape_def_ = shape.shape_def_; } @@ -112,7 +122,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeTensorDesc : public AttrH void Init(); - // Create getensordesc from proto obj + // Create from proto obj GeTensorDesc(const ProtoMsgOwner &protoOnwer, proto::TensorDescriptor *protoMsg); friend class GeTensor; friend class GeAttrValueImp; @@ -159,10 +169,10 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeTensor { friend class GeAttrValueImp; friend class ModelSerializeImp; friend class OnnxUtils; - // Create getensor from proto obj + // Create from proto obj GeTensor(const ProtoMsgOwner &protoOnwer, proto::TensorDef *protoMsg); GeIrProtoHelper tensor_def_; - // Reference from tensorDef_, cab not use it directly + // Reference from tensorDef_, do not use it directly mutable GeTensorDesc __desc_; GeTensorDesc &DescReference() const; }; diff --git a/inc/graph/model.h b/inc/graph/model.h index 02510d8f..38ea501b 100644 --- a/inc/graph/model.h +++ b/inc/graph/model.h @@ -21,7 +21,6 @@ #include #include #include - #include "detail/attributes_holder.h" #include "graph/ge_attr_value.h" #include "graph/graph.h" @@ -62,7 +61,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Model : public AttrHolder { using AttrHolder::HasAttr; using AttrHolder::SetAttr; - graphStatus Save(Buffer &buffer) const; + graphStatus Save(Buffer &buffer, bool is_dump = false) const; graphStatus SaveToFile(const string &file_name) const; // Model will be rewrite diff --git a/inc/graph/model_serialize.h b/inc/graph/model_serialize.h index 7f354388..3f7d65a9 100644 --- a/inc/graph/model_serialize.h +++ b/inc/graph/model_serialize.h @@ -19,7 +19,6 @@ #include #include - #include "graph/buffer.h" #include "graph/compute_graph.h" #include "graph/model.h" @@ -27,7 +26,7 @@ namespace ge { class ModelSerialize { public: - Buffer SerializeModel(const Model &model); + Buffer SerializeModel(const Model &model, bool is_dump = false); Model UnserializeModel(const uint8_t *data, size_t len); Model UnserializeModel(ge::proto::ModelDef &model_def); diff --git a/inc/graph/node.h b/inc/graph/node.h index 2785b0b9..74aaf72f 100644 --- a/inc/graph/node.h +++ b/inc/graph/node.h @@ -113,25 +113,25 @@ class Node : public std::enable_shared_from_this { bool IsAllInNodesSeen(std::unordered_set &nodes_seen) const; - // All inData nodes + // All in Data nodes Vistor GetInDataNodes() const; - // All inControl nodes + // All in Control nodes Vistor GetInControlNodes() const; // GetInAllNodes = InDataNodes + InControlNodes Vistor
GetInAllNodes() const; - // All outData nodes + // All out Data nodes Vistor GetOutDataNodes() const; uint32_t GetOutDataNodesSize() const; - // All outControl nodes + // All out Control nodes Vistor GetOutControlNodes() const; // GetOutAllNodes = OutDataNodes + InControlNodes Vistor GetOutAllNodes() const; - // Get all indata nodes and its outanchor + // Get all in data nodes and its out-anchor Vistor> GetInDataNodesAndAnchors() const; - // Get all outdata nodes and its inanchor + // Get all out data nodes and its in-anchor Vistor> GetOutDataNodesAndAnchors() const; graphStatus InferShapeAndType() const; @@ -176,7 +176,7 @@ class Node : public std::enable_shared_from_this { void SetOrigNode(const NodePtr &orignode) { orig_node_ = orignode; } - NodePtr GetOrigNode(void) { return orig_node_; } + NodePtr GetOrigNode() { return orig_node_; } private: bool NodeMembersAreEqual(const Node &r_node) const; diff --git a/inc/graph/op_desc.h b/inc/graph/op_desc.h index 9a07641b..ab59155e 100644 --- a/inc/graph/op_desc.h +++ b/inc/graph/op_desc.h @@ -23,7 +23,6 @@ #include #include #include - #include "detail/attributes_holder.h" #include "graph/range_vistor.h" @@ -108,6 +107,8 @@ class OpDesc : public std::enable_shared_from_this, public AttrHolder { size_t GetInputsSize() const; + size_t GetAllInputsSize() const; + graphStatus AddOutputDesc(const GeTensorDesc &output_desc); graphStatus AddOutputDesc(const string &name, const GeTensorDesc &output_desc); @@ -122,6 +123,8 @@ class OpDesc : public std::enable_shared_from_this, public AttrHolder { GeTensorDescPtr MutableOutputDesc(uint32_t index) const; + uint32_t GetAllOutputsDescSize() const; + Vistor GetAllOutputsDesc() const; Vistor GetAllOutputsDescPtr() const; @@ -132,6 +135,10 @@ class OpDesc : public std::enable_shared_from_this, public AttrHolder { ConstGeTensorDescPtr GetInputDescPtr(uint32_t index) const; + ConstGeTensorDescPtr GetInputDescPtrDfault(uint32_t index) const; + + ConstGeTensorDescPtr GetInputDescPtr(const string &name) const; + graphStatus AddDynamicInputDesc(const string &name, const unsigned int num, bool isPushBack = true); graphStatus AddDynamicOutputDesc(const string &name, const unsigned int num, bool isPushBack = true); @@ -140,7 +147,11 @@ class OpDesc : public std::enable_shared_from_this, public AttrHolder { bool IsOptionalInput(uint32_t index) const; - std::map GetAllInputName(); + std::map GetAllInputName() const; + + void SetAllInputName(const std::map &input_name_idx); + + std::vector GetAllOptionalInputName() const; std::map GetAllOutputName(); @@ -225,6 +236,14 @@ class OpDesc : public std::enable_shared_from_this, public AttrHolder { std::string GetOpEngineName() const; + graphStatus AddSubgraphName(const std::string &name); + const std::map &GetSubgraphNameIndexes() const; + + std::string GetSubgraphInstanceName(uint32_t index) const; + const std::vector &GetSubgraphInstanceNames() const; + void AddSubgraphInstanceName(std::string name); + void RemoveSubgraphInstanceName(const std::string &name); + protected: ProtoAttrMapHelper MutableAttrMap() override; ConstProtoAttrMapHelper GetAttrMap() const override; @@ -236,9 +255,9 @@ class OpDesc : public std::enable_shared_from_this, public AttrHolder { bool OpDescGenTensorDescsAreEqual(const OpDesc &r_op_desc) const; GeIrProtoHelper op_def_; + std::vector subgraph_instance_names_; + std::map subgraph_names_to_index_; vector inputs_desc_{}; - map input_name_idx_{}; - std::unordered_set optional_input_names_{}; vector outputs_desc_{}; map output_name_idx_{}; 
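The new subgraph-name bookkeeping on OpDesc separates declared subgraph slots (stable names such as an If op's branches) from the concrete graph instances bound to them. A sketch of how the two lists relate, using only the member functions declared above (the names are invented; return values are ignored for brevity):

#include <memory>
#include <string>
#include "graph/op_desc.h"

void RegisterBranches(const ge::OpDescPtr &if_desc) {
  // Declare the subgraph slots; each name receives the next index in the
  // name-to-index map returned by GetSubgraphNameIndexes().
  (void)if_desc->AddSubgraphName("then_branch");  // index 0
  (void)if_desc->AddSubgraphName("else_branch");  // index 1
  // Bind concrete graph instances to the slots, in the same order.
  if_desc->AddSubgraphInstanceName("then_branch_inst");
  if_desc->AddSubgraphInstanceName("else_branch_inst");
  // Slots can later be resolved back to instance names by index.
  std::string inst = if_desc->GetSubgraphInstanceName(0U);  // "then_branch_inst"
}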
std::function infer_func_ = nullptr; diff --git a/inc/graph/operator_factory_impl.h b/inc/graph/operator_factory_impl.h index 92d38583..ea343ebc 100644 --- a/inc/graph/operator_factory_impl.h +++ b/inc/graph/operator_factory_impl.h @@ -21,7 +21,6 @@ #include #include #include - #include "graph/operator_factory.h" namespace ge { @@ -47,7 +46,6 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OperatorFactoryImpl { static graphStatus RegisterVerifyFunc(const std::string &operator_type, VerifyFunc const verify_func); - private: static shared_ptr> operator_creators_; static shared_ptr> operator_infershape_funcs_; static shared_ptr> operator_inferformat_funcs_; diff --git a/inc/graph/shape_refiner.h b/inc/graph/shape_refiner.h index ef5b8aab..65664615 100644 --- a/inc/graph/shape_refiner.h +++ b/inc/graph/shape_refiner.h @@ -18,8 +18,8 @@ #define INC_GRAPH_SHAPE_REFINER_H_ #include - #include "external/graph/inference_context.h" + #include "external/graph/ge_error_codes.h" #include "graph/node.h" @@ -27,8 +27,10 @@ namespace ge { // ShapeRefiner performs shape inference for compute graphs class ShapeRefiner { public: - static graphStatus InferShapeAndType(const ConstNodePtr &node, Operator &op); + static graphStatus InferShapeAndType(const ConstNodePtr &node, Operator &op, bool before_subgraph); + static graphStatus InferShapeAndType(const NodePtr &node, bool before_subgraph); static graphStatus InferShapeAndType(const NodePtr &node); + static graphStatus InferShapeAndType(const ConstNodePtr &node, Operator &op); private: static void PrintInOutTensorShape(const ge::NodePtr &node, const std::string &phase); diff --git a/inc/graph/usr_types.h b/inc/graph/usr_types.h index 796a70a3..90e02001 100644 --- a/inc/graph/usr_types.h +++ b/inc/graph/usr_types.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef INC_EXTERNAL_GRAPH_USR_TYPES_H_ -#define INC_EXTERNAL_GRAPH_USR_TYPES_H_ +#ifndef INC_GRAPH_USR_TYPES_H_ +#define INC_GRAPH_USR_TYPES_H_ #include #include @@ -130,4 +130,4 @@ struct UsrQuantizeFactorParams { #undef USR_TYPE_BYTES_DEC } // namespace ge -#endif // INC_EXTERNAL_GRAPH_USR_TYPES_H_ +#endif // INC_GRAPH_USR_TYPES_H_ diff --git a/inc/graph/utils/attr_utils.h b/inc/graph/utils/attr_utils.h index 37dc79e9..ab89ebc7 100644 --- a/inc/graph/utils/attr_utils.h +++ b/inc/graph/utils/attr_utils.h @@ -99,8 +99,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY AttrUtils { static bool SetZeroCopyBytes(AttrHolderAdapter &&obj, const string &name, Buffer &&buffer); static bool GetZeroCopyBytes(ConstAttrHolderAdapter &&obj, const string &name, Buffer &buffer); // Value will be moved - static bool SetZeroCopyListBytes(AttrHolderAdapter &&obj, const string &name, - vector &listBuffer); + static bool SetZeroCopyListBytes(AttrHolderAdapter &&obj, const string &name, vector &listBuffer); static bool GetZeroCopyListBytes(ConstAttrHolderAdapter &&obj, const string &name, vector &listBuffer); static bool SetListListInt(AttrHolderAdapter &&obj, const string &name, const vector> &value); @@ -116,6 +115,8 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY AttrUtils { static OpDescPtr CopyOpDesc(const ConstOpDescPtr &orgOpDesc); + static std::string GetAllAttrsStr(ConstAttrHolderAdapter &&obj); + class AttrHolderAdapter { public: AttrHolderAdapter(AttrHolder *obj) : obj_(obj) {} diff --git a/inc/graph/utils/graph_utils.h b/inc/graph/utils/graph_utils.h index 2d8f0fb9..8066e8b5 100644 --- a/inc/graph/utils/graph_utils.h +++ b/inc/graph/utils/graph_utils.h @@ -137,6 +137,18 @@ class GraphUtils { static graphStatus InsertTransNode(ComputeGraphPtr compute_graph, const InDataAnchorPtr &in_data_anchor, const std::vector &vec_op_desc); + /// + /// @brief Insert node: src->insert_node:input_index, insert_node:output_index->dst + /// @param [in] src + /// @param [in] dsts + /// @param [in] insert_node + /// @param [in] input_index + /// @param [in] output_index + /// @return graphStatus + /// + static graphStatus InsertNodeBefore(const OutDataAnchorPtr &src, const std::vector &dsts, + const NodePtr &insert_node, uint32_t input_index = 0, uint32_t output_index = 0); + static graphStatus RemoveJustNode(ComputeGraphPtr compute_graph, const NodePtr &node); static graphStatus RemoveJustNode(ComputeGraph &compute_graph, const NodePtr &node); @@ -145,16 +157,12 @@ class GraphUtils { static void RecordOriginalNames(std::vector names_tmp, const ge::NodePtr &node); - static bool CheckIsTrainGraph(const ge::ComputeGraphPtr &compute_graph); - static bool MatchDumpStr(const std::string &suffix); static void DumpGEGraph(const ge::ComputeGraphPtr &graph, const std::string &suffix, bool is_always_dump = false); static bool LoadGEGraph(const char *file, ge::ComputeGraph &compute_graph); - static bool CheckGlobalStepNode(const ge::NodePtr &node); - static void BreakConnect(const std::map &all_nodes_infos); static void DumpGEGraphToOnnx(const ge::ComputeGraph &compute_graph, const std::string &suffix); @@ -252,6 +260,315 @@ class GraphUtils { /// @return success: GRAPH_SUCESS /// static graphStatus MoveOutCtrlEdges(NodePtr &src_node, NodePtr &dst_node); + + static ComputeGraphPtr FindRootGraph(ComputeGraphPtr graph); +}; + +class ComputeGraphBuilder { + public: + ComputeGraphBuilder() : owner_graph_(nullptr) {} + ComputeGraphBuilder(const ComputeGraphBuilder &) = delete; + ComputeGraphBuilder 
&operator=(const ComputeGraphBuilder &) = delete; + ComputeGraphBuilder(const ComputeGraphBuilder &&) = delete; + ComputeGraphBuilder &operator=(const ComputeGraphBuilder &&) = delete; + ~ComputeGraphBuilder() = default; + + /// + /// @brief Add node to graph + /// @param [in] op_desc + /// @return ComputeGraphBuilder + /// + virtual ComputeGraphBuilder &AddNode(const OpDescPtr &op_desc); + + /// + /// @brief Add data-link among nodes in graph + /// @param [in] src_name + /// @param [in] out_anchor_ind + /// @param [in] dst_name + /// @param [in] in_anchor_ind + /// @return ComputeGraphBuilder + /// + virtual ComputeGraphBuilder &AddDataLink(const std::string &src_name, uint32_t out_anchor_ind, + const std::string &dst_name, uint32_t in_anchor_ind); + + /// + /// @brief Add ctrl-link among nodes in graph + /// @param [in] src_name + /// @param [in] dst_name + /// @return ComputeGraphBuilder + /// + virtual ComputeGraphBuilder &AddControlLink(const std::string &src_name, const std::string &dst_name); + + /// + /// @brief Build graph + /// @param [out] error_code + /// @param [out] error_msg + /// @return ComputeGraphPtr + /// + virtual ComputeGraphPtr Build(graphStatus &error_code, std::string &error_msg) = 0; + + /// @brief Get node with name + /// @param [in] name + /// @return NodePtr + /// + NodePtr GetNode(const std::string &name); + + protected: + /// + /// @brief Build nodes + /// @param [out] error_code + /// @param [out] error_msg + /// @return void + /// + void BuildNodes(graphStatus &error_code, std::string &error_msg); + + /// + /// @brief Build data-links + /// @param [out] error_code + /// @param [out] error_msg + /// @return void + /// + void BuildDataLinks(graphStatus &error_code, std::string &error_msg); + + /// + /// @brief Build ctrl-links + /// @param [out] error_code + /// @param [out] error_msg + /// @return void + /// + void BuildCtrlLinks(graphStatus &error_code, std::string &error_msg); + + ComputeGraphPtr owner_graph_; + + // node_name -> node + std::map node_names_; + std::vector nodes_; + + // -> + std::vector, std::pair>> data_links_; + // src_node_name -> dst_node_name + std::vector> ctrl_links_; +}; + +class CompleteGraphBuilder : public ComputeGraphBuilder { + public: + explicit CompleteGraphBuilder(std::string name) : name_(std::move(name)), parent_node_(nullptr) {} + CompleteGraphBuilder(const CompleteGraphBuilder &) = delete; + CompleteGraphBuilder &operator=(const CompleteGraphBuilder &) = delete; + CompleteGraphBuilder(const CompleteGraphBuilder &&) = delete; + CompleteGraphBuilder &operator=(const CompleteGraphBuilder &&) = delete; + ~CompleteGraphBuilder() = default; + + /// + /// @brief Add node to graph + /// @param [in] op_desc + /// @return CompleteGraphBuilder + /// + CompleteGraphBuilder &AddNode(const OpDescPtr &op_desc) override; + + /// + /// @brief Add data-link among nodes in graph + /// @param [in] src_name + /// @param [in] out_anchor_ind + /// @param [in] dst_name + /// @param [in] in_anchor_ind + /// @return CompleteGraphBuilder + /// + CompleteGraphBuilder &AddDataLink(const std::string &src_name, uint32_t out_anchor_ind, const std::string &dst_name, + uint32_t in_anchor_ind) override; + + /// + /// @brief Add ctrl-link among nodes in graph + /// @param [in] src_name + /// @param [in] dst_name + /// @return CompleteGraphBuilder + /// + CompleteGraphBuilder &AddControlLink(const std::string &src_name, const std::string &dst_name) override; + + /// + /// @brief Set index_th input anchor for graph + /// @param [in] index + /// @param [in] 
node_names + /// @param [in] anchor_inds + /// @return CompleteGraphBuilder + /// + CompleteGraphBuilder &SetInput(uint32_t index, const std::vector &node_names, + const std::vector &anchor_inds); + + /// + /// @brief Set index_th input of graph as useless + /// @param [in] index + /// @return CompleteGraphBuilder + /// + CompleteGraphBuilder &SetUselessInput(uint32_t index); + + /// + /// @brief Add output anchor for graph + /// @param [in] owner_node_name + /// @param [in] anchor_ind + /// @return CompleteGraphBuilder + /// + CompleteGraphBuilder &AddOutput(const std::string &owner_node_name, uint32_t anchor_ind); + + /// + /// @brief Set parent-node of graph + /// @param [in] parent_node + /// @return CompleteGraphBuilder + /// + CompleteGraphBuilder &SetParentNode(const NodePtr &parent_node); + + /// + /// @brief Set mapping-relation of parent-node in_anchor_ind & Data-node + /// @param [in] input_mapping: index_of_graph_input -> in_anchor_index_of_parent_node + /// @return CompleteGraphBuilder + /// + CompleteGraphBuilder &SetInputMapping(const std::map &input_mapping); + + /// + /// @brief Set mapping-relation of parent-node out_anchor_ind & NetOutput-node out_anchor_ind + /// @param [in] output_mapping: index_of_graph_output -> out_anchor_index_of_parent_node + /// @return CompleteGraphBuilder + /// + CompleteGraphBuilder &SetOutputMapping(const std::map &output_mapping); + + /// + /// @brief Build graph + /// @param [out] error_code + /// @param [out] error_msg + /// @return ComputeGraphPtr + /// + ComputeGraphPtr Build(graphStatus &error_code, std::string &error_msg) override; + + private: + /// + /// @brief Build inputs + /// @param [out] error_code + /// @param [out] error_msg + /// @return void + /// + void BuildInputs(graphStatus &error_code, std::string &error_msg); + + /// + /// @brief Add data node + /// @param [in] index + /// @param [out] error_code + /// @param [out] error_msg + /// @return void + /// + NodePtr AddDateNode(uint32_t index, graphStatus &error_code, std::string &error_msg); + + /// + /// @brief Build outputs + /// @param [out] error_code + /// @param [out] error_msg + /// @return void + /// + void BuildOutputs(graphStatus &error_code, std::string &error_msg); + + /// + /// @brief Add NetOutput node + /// @param [out] error_code + /// @param [out] error_msg + /// @return NodePtr + /// + NodePtr AddNetOutputNode(graphStatus &error_code, std::string &error_msg); + + /// + /// @brief Add input/output tensor for NetOutput node + /// @param [in] out_nodes_info + /// @param [out] net_output_desc + /// @return graphStatus + /// + graphStatus BuildInOutForNetOutput(const std::vector> &out_nodes_info, + OpDescPtr &net_output_desc); + + /// + /// @brief Add edge for NetOutput node + /// @param [in] out_nodes_info + /// @param [out] net_output_node + /// @return graphStatus + /// + graphStatus AddEdgeForNetOutput(const std::vector> &out_nodes_info, + const NodePtr &net_output_node); + + std::string name_; + NodePtr parent_node_; + std::map, std::vector>> graph_inputs_; + std::vector> graph_outputs_; + + // index_of_graph_input -> in_anchor_index_of_parent_node + std::map input_mapping_; + // index_of_graph_output -> out_anchor_index_of_parent_node + std::map output_mapping_; +}; + +class PartialGraphBuilder : public ComputeGraphBuilder { + public: + PartialGraphBuilder() = default; + PartialGraphBuilder(const PartialGraphBuilder &) = delete; + PartialGraphBuilder &operator=(const PartialGraphBuilder &) = delete; + PartialGraphBuilder(const PartialGraphBuilder &&) = 
+  PartialGraphBuilder &operator=(const PartialGraphBuilder &&) = delete;
+  ~PartialGraphBuilder() = default;
+
+  ///
+  /// @brief Add node to graph
+  /// @param [in] op_desc
+  /// @return PartialGraphBuilder
+  ///
+  PartialGraphBuilder &AddNode(const OpDescPtr &op_desc) override;
+
+  ///
+  /// @brief Add data-link among nodes in graph
+  /// @param [in] src_name
+  /// @param [in] out_anchor_ind
+  /// @param [in] dst_name
+  /// @param [in] in_anchor_ind
+  /// @return PartialGraphBuilder
+  ///
+  PartialGraphBuilder &AddDataLink(const std::string &src_name, uint32_t out_anchor_ind, const std::string &dst_name,
+                                   uint32_t in_anchor_ind) override;
+
+  ///
+  /// @brief Add ctrl-link among nodes in graph
+  /// @param [in] src_name
+  /// @param [in] dst_name
+  /// @return PartialGraphBuilder
+  ///
+  PartialGraphBuilder &AddControlLink(const std::string &src_name, const std::string &dst_name) override;
+
+  ///
+  /// @brief Set owner graph
+  /// @param [in] graph
+  /// @return PartialGraphBuilder
+  ///
+  PartialGraphBuilder &SetOwnerGraph(const ComputeGraphPtr &graph);
+
+  ///
+  /// @brief Add exist node
+  /// @param [in] node
+  /// @return PartialGraphBuilder
+  ///
+  PartialGraphBuilder &AddExistNode(const NodePtr &node);
+
+  ///
+  /// @brief Build multi nodes with links
+  /// @param [out] error_code
+  /// @param [out] error_msg
+  /// @return ComputeGraphPtr
+  ///
+  ComputeGraphPtr Build(graphStatus &error_code, std::string &error_msg) override;
+
+ private:
+  ///
+  /// @brief Build exist nodes
+  /// @param [out] error_code
+  /// @param [out] error_msg
+  /// @return void
+  ///
+  void BuildExistNodes(graphStatus &error_code, std::string &error_msg);
+
+  std::vector<NodePtr> exist_nodes_;
+};
 }  // namespace ge
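A minimal usage sketch of the builder API above (not part of this patch; the op descs, node names and indices are illustrative, assuming the caller has already prepared OpDescPtr values named "data" and "relu"):

#include "graph/utils/graph_utils.h"

// Builds data:0 -> relu:0, exposes data as graph input 0 and relu:0 as output 0.
ge::ComputeGraphPtr BuildDemoGraph(const ge::OpDescPtr &data_desc, const ge::OpDescPtr &relu_desc) {
  ge::graphStatus error_code = ge::GRAPH_SUCCESS;
  std::string error_msg;
  ge::CompleteGraphBuilder builder("demo_graph");
  builder.AddNode(data_desc)
      .AddNode(relu_desc)
      .AddDataLink("data", 0, "relu", 0)  // src_name, out_anchor_ind, dst_name, in_anchor_ind
      .SetInput(0, {"data"}, {0})         // graph input 0 feeds data:0
      .AddOutput("relu", 0);              // graph output 0 is taken from relu:0
  // On failure Build() reports the failing phase via error_code / error_msg.
  return builder.Build(error_code, error_msg);
}
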
diff --git a/inc/graph/utils/node_utils.h b/inc/graph/utils/node_utils.h
index 3902ed08..c979f727 100644
--- a/inc/graph/utils/node_utils.h
+++ b/inc/graph/utils/node_utils.h
@@ -56,6 +56,11 @@ class NodeUtils {
   static graphStatus UpdateOutputShape(const Node &node, uint32_t index, const GeShape &shape);
   static graphStatus UpdateInputShape(const Node &node, uint32_t index, const GeShape &shape);
 
+  static std::string GetNodeType(const Node &node);
+
+  static ComputeGraphPtr GetSubgraph(const Node &node, uint32_t index);
+  static graphStatus AddSubgraph(Node &node, const ComputeGraphPtr &subgraph);
+
  private:
   static std::map<NodePtr, std::vector<uint32_t>> map_send_info_;
   static std::map<NodePtr, std::vector<uint32_t>> map_recv_info_;
diff --git a/inc/graph/utils/op_desc_utils.h b/inc/graph/utils/op_desc_utils.h
index 363e0ed5..210ba0a5 100644
--- a/inc/graph/utils/op_desc_utils.h
+++ b/inc/graph/utils/op_desc_utils.h
@@ -20,7 +20,6 @@
 #include <map>
 #include <string>
 #include <vector>
-
 #include "graph/def_types.h"
 #include "graph/node.h"
 #include "graph/op_desc.h"
@@ -29,7 +28,6 @@
 namespace ge {
 class OpDesc;
-
 using OpDescPtr = std::shared_ptr<OpDesc>;
 
 class OpDescUtils {
@@ -39,55 +37,108 @@ class OpDescUtils {
   OpDescUtils() = default;
   ~OpDescUtils() = default;
-  static bool HasQuantizeFactorParams(const OpDescPtr &op_desc);
-  static bool HasQuantizeFactorParams(const OpDesc &op_desc);
-  static graphStatus GetQuantizeFactorParams(const OpDescPtr &op_desc, QuantizeFactorParams &quant);
-  static graphStatus GetQuantizeFactorParams(const OpDesc &op_desc, QuantizeFactorParams &quant);
-  static graphStatus SetQuantizeFactorParams(const OpDescPtr &op_desc, const QuantizeFactorParams &quant);
-  static graphStatus SetQuantizeFactorParams(OpDesc &op_desc, const QuantizeFactorParams &quant);
-
-  static vector<ge::NodePtr> GetConstInputNode(const ge::Node &node);
-  static vector<ConstGeTensorPtr> GetInputData(const vector<ge::NodePtr> &input_nodes);
-
-  static vector<ConstGeTensorPtr> GetWeights(const ge::Node &node);
-  static vector<ConstGeTensorPtr> GetWeights(const ge::ConstNodePtr &node);
-  static vector<GeTensorPtr> MutableWeights(const ge::Node &node);
+  static bool HasQuantizeFactorParams(const OpDescPtr& op_desc);
+  static bool HasQuantizeFactorParams(const OpDesc& op_desc);
+  static graphStatus GetQuantizeFactorParams(const OpDescPtr& op_desc, QuantizeFactorParams& quant);
+  static graphStatus GetQuantizeFactorParams(const OpDesc& op_desc, QuantizeFactorParams& quant);
+  static graphStatus SetQuantizeFactorParams(const OpDescPtr& op_desc, const QuantizeFactorParams& quant);
+  static graphStatus SetQuantizeFactorParams(OpDesc& op_desc, const QuantizeFactorParams& quant);
+
+  static vector<ge::NodePtr> GetConstInputNode(const ge::Node& node);
+  static vector<ConstGeTensorPtr> GetInputData(const vector<ge::NodePtr>& input_nodes);
+
+  static vector<ConstGeTensorPtr> GetWeights(const ge::Node& node);
+  static vector<ConstGeTensorPtr> GetWeights(const ge::ConstNodePtr& node);
+  static vector<GeTensorPtr> MutableWeights(const ge::Node& node);
   static vector<GeTensorPtr> MutableWeights(const ge::NodePtr node);
-  static graphStatus SetWeights(ge::Node &node, const vector<GeTensorPtr> &weights);
-  static graphStatus SetWeights(ge::NodePtr node, const vector<GeTensorPtr> &weights);
+  static graphStatus SetWeights(ge::Node& node, const vector<GeTensorPtr>& weights);
+  static graphStatus SetWeights(ge::NodePtr node, const vector<GeTensorPtr>& weights);
   static graphStatus ClearWeights(ge::NodePtr node);
 
   static bool ClearInputDesc(ge::OpDescPtr op_desc, uint32_t index);
-  static bool ClearInputDesc(const ge::NodePtr &node);
-  static bool ClearOutputDesc(const ge::OpDescPtr &op_desc, uint32_t index);
-  static bool ClearOutputDesc(const ge::NodePtr &node);
-  static vector<ConstGeTensorPtr> GetConstInputs(const ge::Node &node);
-  static vector<ConstGeTensorPtr> GetConstInputs(const ge::ConstNodePtr &node);
-  static size_t GetNonConstInputsSize(const ge::Node &node);
+  static bool ClearInputDesc(const ge::NodePtr& node);
+  static bool ClearOutputDesc(const ge::OpDescPtr& op_desc, uint32_t index);
+  static bool ClearOutputDesc(const ge::NodePtr& node);
+  static vector<ConstGeTensorPtr> GetConstInputs(const ge::Node& node);
+  static vector<ConstGeTensorPtr> GetConstInputs(const ge::ConstNodePtr& node);
+  static size_t GetNonConstInputsSize(const ge::Node& node);
   static size_t GetNonConstInputsSize(ge::ConstNodePtr node);
-  // Index: Indicate the index of all non const inputs
-  static GeTensorDesc GetNonConstInputTensorDesc(const ge::Node &node, size_t index_non_const = 0);
-  static GeTensorDesc GetNonConstInputTensorDesc(const ge::ConstNodePtr &node, size_t index_non_const = 0);
-  static bool GetNonConstInputIndex(const ge::Node &node, size_t index_non_const, size_t &index);
-  static bool GetNonConstInputIndex(const ge::ConstNodePtr &node, size_t index_non_const, size_t &index);
-  // Index: Indicate the index of all inputs
-  static bool IsNonConstInput(const ge::Node &node, size_t index = 0);
-  static bool IsNonConstInput(const ge::ConstNodePtr &node, size_t index = 0);
-
-  static vector<GeTensorDesc> GetNonConstTensorDesc(const ge::ConstNodePtr &node);
-  static graphStatus AddConstOpToAnchor(InDataAnchorPtr in_anchor, const GeTensorPtr &tensor_ptr);
+  // Index: Indicates the index of all non const inputs
+  static GeTensorDesc GetNonConstInputTensorDesc(const ge::Node& node, size_t index_non_const = 0);
+  static GeTensorDesc GetNonConstInputTensorDesc(const ge::ConstNodePtr& node, size_t index_non_const = 0);
+  static bool GetNonConstInputIndex(const ge::Node& node, size_t index_non_const, size_t& index);
+  static bool GetNonConstInputIndex(const ge::ConstNodePtr& node, size_t index_non_const, size_t& index);
+  // Index: Indicates the index of all inputs
+  static bool IsNonConstInput(const ge::Node& node, size_t index = 0);
+  static bool IsNonConstInput(const ge::ConstNodePtr& node, size_t index = 0);
+
+  static vector<GeTensorDesc> GetNonConstTensorDesc(const ge::ConstNodePtr& node);
+  static graphStatus AddConstOpToAnchor(InDataAnchorPtr in_anchor, const GeTensorPtr& tensor_ptr);
 
   static Operator CreateOperatorFromOpDesc(OpDescPtr op_desc);
   static Operator CreateOperatorFromNode(ge::ConstNodePtr node_ptr);
-  static OpDescPtr GetOpDescFromOperator(const Operator &oprt);
+  static OpDescPtr GetOpDescFromOperator(const Operator& oprt);
 
-  static OpDescPtr CreateConstOp(const GeTensorPtr &tensor_ptr);
+  static OpDescPtr CreateConstOp(const GeTensorPtr& tensor_ptr);
 
  private:
-  static GeTensorPtr MutableWeights(ge::OpDesc &op_desc);
+  static GeTensorPtr MutableWeights(ge::OpDesc& op_desc);
   static GeTensorPtr MutableWeights(ge::OpDescPtr op_desc);
-  static graphStatus SetWeights(ge::OpDesc &op_desc, const GeTensorPtr weight);
+  static graphStatus SetWeights(ge::OpDesc& op_desc, const GeTensorPtr weight);
   static graphStatus SetWeights(ge::OpDescPtr op_desc, const GeTensorPtr weight);
 };
+
+class OpDescBuilder {
+ public:
+  OpDescBuilder(std::string name, std::string type) : name_(std::move(name)), type_(std::move(type)) {}
+  OpDescBuilder(const OpDescBuilder&) = delete;
+  OpDescBuilder& operator=(const OpDescBuilder&) = delete;
+  OpDescBuilder(const OpDescBuilder&&) = delete;
+  OpDescBuilder& operator=(const OpDescBuilder&&) = delete;
+  ~OpDescBuilder() = default;
+
+  ///
+  /// @brief Add input
+  /// @param [in] name
+  /// @return OpDescBuilder
+  ///
+  OpDescBuilder& AddInput(const std::string& name);
+
+  ///
+  /// @brief Add dynamic input
+  /// @param [in] name
+  /// @param [in] num
+  /// @return OpDescBuilder
+  ///
+  OpDescBuilder& AddDynamicInput(const std::string& name, uint32_t num);
+
+  ///
+  /// @brief Add output
+  /// @param [in] name
+  /// @return OpDescBuilder
+  ///
+  OpDescBuilder& AddOutput(const std::string& name);
+
+  ///
+  /// @brief Add dynamic output
+  /// @param [in] name
+  /// @param [in] num
+  /// @return OpDescBuilder
+  ///
+  OpDescBuilder& AddDynamicOutput(const std::string& name, uint32_t num);
+
+  ///
+  /// @brief Build op_desc
+  /// @return OpDescPtr
+  ///
+  OpDescPtr Build();
+
+ private:
+  std::string name_;
+  std::string type_;
+  std::vector<std::string> inputs_;
+  std::vector<std::string> outputs_;
+};
 }  // namespace ge
+
 #endif  // INC_GRAPH_UTILS_OP_DESC_UTILS_H_
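A sketch of OpDescBuilder chaining (not part of this patch; the op name/type are hypothetical, and the expansion of dynamic inputs by index is an assumption, not stated by this header):

#include "graph/utils/op_desc_utils.h"

// Assembles an OpDesc with n dynamic inputs, one static input and one output.
ge::OpDescPtr MakeConcatDesc(uint32_t n) {
  return ge::OpDescBuilder("concat_0", "Concat")  // name, type
      .AddDynamicInput("x", n)                    // assumed to expand to x0..x{n-1}
      .AddInput("concat_dim")
      .AddOutput("y")
      .Build();                                   // yields the assembled OpDescPtr
}
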
diff --git a/inc/graph/utils/tensor_utils.h b/inc/graph/utils/tensor_utils.h
index 934ad12f..2fa398db 100644
--- a/inc/graph/utils/tensor_utils.h
+++ b/inc/graph/utils/tensor_utils.h
@@ -18,15 +18,14 @@
 #define INC_GRAPH_UTILS_TENSOR_UTILS_H_
 
 #include <vector>
-
 #include "graph/def_types.h"
 #include "graph/ge_error_codes.h"
 #include "graph/ge_tensor.h"
 namespace ge {
 class TensorUtils {
  public:
-  static ge::graphStatus GetSize(const GeTensorDesc &tensorDesc, uint32_t &size);
-  static void SetSize(GeTensorDesc &tensorDesc, uint32_t size);
+  static ge::graphStatus GetSize(const GeTensorDesc &tensorDesc, int64_t &size);
+  static void SetSize(GeTensorDesc &tensorDesc, int64_t size);
   static uint32_t GetWeightSize(const ConstGeTensorPtr &tensorPtr);
   static uint32_t GetWeightSize(const GeTensor &tensor);
   static uint32_t GetWeightSize(const GeTensorDesc &tensorDesc);
@@ -62,16 +61,16 @@ class TensorUtils {
   static void SetRC(GeTensorDesc &tensorDesc, uint32_t rc);
 
   ///
-  /// calculate mem size of the tensor.
+  /// calculate tensor mem size.
   /// @param shape tensor shape
   /// @param format tensor format
   /// @param data_type tensor data type
-  /// @param mem_size -1 means unknown shape,others means mem size
-  /// @return GRAPH_SUCCESS:success, others:failed
+  /// @param mem_size -1 means unknown shape, others mean mem size
+  /// @return GRAPH_SUCCESS: success, others: failed
   ///
   static ge::graphStatus CalcTensorMemSize(const GeShape &shape, Format format, DataType data_type,
                                            int64_t &mem_size);
-  static ge::graphStatus GetTensorMemorySizeInBytes(const GeTensorDesc &desc_temp, uint32_t &size_temp);
-  static ge::graphStatus GetTensorSizeInBytes(const GeTensorDesc &desc_temp, uint32_t &size_temp);
+  static ge::graphStatus GetTensorMemorySizeInBytes(const GeTensorDesc &desc_temp, int64_t &size_temp);
+  static ge::graphStatus GetTensorSizeInBytes(const GeTensorDesc &desc_temp, int64_t &size_temp);
 };
 }  // namespace ge
 #endif  // INC_GRAPH_UTILS_TENSOR_UTILS_H_
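The widening from uint32_t to int64_t matters both for tensors over 4 GiB and for the -1 unknown-shape sentinel. A hedged sketch of the calling pattern implied by the doc comment above (helper name is illustrative):

#include "graph/utils/tensor_utils.h"

// Returns the tensor memory size in bytes, or -1 while the shape is unknown.
int64_t QueryMemSize(const ge::GeShape &shape, ge::Format format, ge::DataType data_type) {
  int64_t mem_size = 0;
  if (ge::TensorUtils::CalcTensorMemSize(shape, format, data_type, mem_size) != ge::GRAPH_SUCCESS) {
    return -1;  // the calculation itself failed
  }
  return mem_size;  // -1 here means unknown shape, per the doc comment above
}
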
diff --git a/src/common/graph/CMakeLists.txt b/src/common/graph/CMakeLists.txt
index 03c93421..56b68c69 100755
--- a/src/common/graph/CMakeLists.txt
+++ b/src/common/graph/CMakeLists.txt
@@ -58,6 +58,7 @@ include_directories(${GE_SOURCE_DIR}/inc/external/graph)
 include_directories(${GE_SOURCE_DIR}/inc/graph)
 include_directories(${GE_SOURCE_DIR}/inc/common)
 include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc)
+include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/ops)
 include_directories(${GE_SOURCE_DIR}/third_party/securec/include)
 include_directories(${CMAKE_BINARY_DIR})
 include_directories(${CMAKE_BINARY_DIR}/proto/ge)
diff --git a/src/common/graph/anchor.cc b/src/common/graph/anchor.cc
index 592d8b78..f02037e5 100644
--- a/src/common/graph/anchor.cc
+++ b/src/common/graph/anchor.cc
@@ -26,6 +26,8 @@ Anchor::Anchor(const NodePtr &owner_node, int idx) : owner_node_(owner_node), id
 bool Anchor::IsTypeOf(TYPE type) const { return strcmp(Anchor::TypeOf(), type) == 0; }
 
+size_t Anchor::GetPeerAnchorsSize() const { return peer_anchors_.size(); }
+
 Anchor::Vistor<AnchorPtr> Anchor::GetPeerAnchors() const {
   vector<AnchorPtr> ret;
   for (const auto &anchor : peer_anchors_) {
diff --git a/src/common/graph/buffer.cc b/src/common/graph/buffer.cc
index ba43377a..48cdd397 100644
--- a/src/common/graph/buffer.cc
+++ b/src/common/graph/buffer.cc
@@ -32,8 +32,7 @@ Buffer::Buffer(const Buffer &other) {
   buffer_ = other.buffer_;
 }
 
-// default
-Buffer::Buffer(std::size_t buffer_size, std::uint8_t default_val) : Buffer() {
+Buffer::Buffer(std::size_t buffer_size, std::uint8_t default_val) : Buffer() {  // default
   auto proto_msg = data_.GetProtoMsg();
   if (proto_msg != nullptr) {
     try {
diff --git a/src/common/graph/compute_graph.cc b/src/common/graph/compute_graph.cc
index 70ddb00f..a35747d4 100644
--- a/src/common/graph/compute_graph.cc
+++ b/src/common/graph/compute_graph.cc
@@ -15,9 +15,7 @@
  */
 
 #include "graph/compute_graph.h"
-
 #include <deque>
-
 #include "./format_refiner.h"
 #include "./ge_context.h"
 #include "debug/ge_attr_define.h"
@@ -41,7 +39,7 @@ const size_t OUTPUT_PARAM_SIZE = 2;
 }  // namespace
 
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraph::ComputeGraph(const std::string &name)
-    : nodes_(), input_nodes_(), sub_graph_(), name_(name), is_valid_flag_(false), need_iteration_(false) {
+    : name_(name), nodes_(), input_nodes_(), sub_graph_(), is_valid_flag_(false), need_iteration_(false) {
   attrs_.InitDefault();
 }
 ComputeGraph::~ComputeGraph() {}
@@ -154,7 +152,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool ComputeGraph::VectorInputNod
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool ComputeGraph::GraphMembersAreEqual(
     const ComputeGraph &r_graph) const {
-  return (IsEqual(this->sub_graph_.size(), r_graph.sub_graph_.size(), "graph.sub_graph_.size()") &&
+  return (IsEqual(this->sub_graph_.size(), r_graph.sub_graph_.size(), "graph.subgraphs_.size()") &&
           IsEqual(this->nodes_.size(), r_graph.nodes_.size(), "graph.nodes_.size()") &&
           VectorInputNodePtrIsEqual(this->input_nodes_, r_graph.input_nodes_) &&
           IsEqual(this->name_, r_graph.name_, "graph.name_") &&
@@ -398,6 +396,165 @@ graphStatus ComputeGraph::RemoveSubGraph(const std::shared_ptr<ComputeGraph> &su
   }
 }
 
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
+ComputeGraph::AddSubgraph(const std::string &name, const std::shared_ptr<ComputeGraph> &subgraph) {
+  if (subgraph == nullptr) {
+    GE_LOGE("Try to add a null subgraph, name %s", name.c_str());
+    return GRAPH_PARAM_INVALID;
+  }
+  auto parent_graph = subgraph->GetParentGraph();
+  if (parent_graph == nullptr) {
+    GE_LOGE("Try to add subgraph without parent graph, name %s", name.c_str());
+    return GRAPH_PARAM_INVALID;
+  }
+  auto parent_node = subgraph->GetParentNode();
+  if (parent_node == nullptr) {
+    GE_LOGE("Try to add a subgraph without parent node, name %s", name.c_str());
+    return GRAPH_PARAM_INVALID;
+  }
+  if (parent_node->GetOwnerComputeGraph() != parent_graph) {
+    GE_LOGE(
+        "Try to add a subgraph which parent node's parent graph is not equal to "
+        "the subgraph's parent graph, subgraph name %s, parent graph name %s",
+        subgraph->GetName().c_str(), parent_graph->GetName().c_str());
+    return GRAPH_PARAM_INVALID;
+  }
+  if (!this->parent_graph_.expired()) {
+    GE_LOGE("The subgraphs can only be added to the root graph");
+    return GRAPH_PARAM_INVALID;
+  }
+  if (name != subgraph->GetName()) {
+    GELOGW("The subgraph name %s is different with input %s", subgraph->GetName().c_str(), name.c_str());
+  }
+  sub_graph_.push_back(subgraph);
+  names_to_subgraph_[name] = subgraph;
+  return GRAPH_SUCCESS;
+}
+
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
+ComputeGraph::AddSubgraph(const std::shared_ptr<ComputeGraph> &subgraph) {
+  if (subgraph == nullptr) {
+    return GRAPH_PARAM_INVALID;
+  }
+  return AddSubgraph(subgraph->GetName(), subgraph);
+}
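(Not part of this patch: a hedged sketch of the call order these checks imply; root_graph, if_node and branch are illustrative names.)

branch->SetParentNode(if_node);                           // parent node first
branch->SetParentGraph(if_node->GetOwnerComputeGraph());  // the graph that owns if_node
ge::graphStatus status = root_graph->AddSubgraph(branch->GetName(), branch);  // root graph only
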
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void ComputeGraph::RemoveSubgraph(const std::string &name) {
+  auto iter = names_to_subgraph_.find(name);
+  if (iter == names_to_subgraph_.end()) {
+    return;
+  }
+  for (auto vec_iter = sub_graph_.begin(); vec_iter != sub_graph_.end(); ++vec_iter) {
+    if (*vec_iter == iter->second) {
+      sub_graph_.erase(vec_iter);
+      break;
+    }
+  }
+  names_to_subgraph_.erase(iter);
+}
+
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void ComputeGraph::RemoveSubgraph(
+    const std::shared_ptr<ComputeGraph> &subgraph) {
+  if (subgraph != nullptr) {
+    RemoveSubgraph(subgraph->GetName());
+  }
+}
+
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY std::shared_ptr<ComputeGraph> ComputeGraph::GetSubgraph(
+    const std::string &name) const {
+  auto iter = names_to_subgraph_.find(name);
+  return iter == names_to_subgraph_.end() ? nullptr : iter->second;
+}
+
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY std::vector<std::shared_ptr<ComputeGraph>>
+ComputeGraph::GetAllSubgraphs() const {
+  return sub_graph_;
+}
+
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY shared_ptr<ComputeGraph> ComputeGraph::GetParentGraph() {
+  return parent_graph_.lock();
+}
+
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void ComputeGraph::SetParentGraph(
+    const shared_ptr<ComputeGraph> &parent) {
+  parent_graph_ = parent;
+}
+
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY shared_ptr<Node> ComputeGraph::GetParentNode() {
+  return parent_node_.lock();
+}
+
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void ComputeGraph::SetParentNode(const shared_ptr<Node> &parent) {
+  parent_node_ = parent;
+}
+
+///
+/// @brief Update input-mapping
+/// @param [in] input_mapping : index_of_cur_graph_node_input -> index_of_new_graph_node_input
+/// @return graphStatus
+///
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
+ComputeGraph::UpdateInputMapping(const std::map<uint32_t, uint32_t> &input_mapping) {
+  for (auto &input : input_nodes_) {
+    uint32_t cur_index = 0;
+    if (!ge::AttrUtils::GetInt(input->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, cur_index)) {
+      continue;
+    }
+    auto iter = input_mapping.find(cur_index);
+    if (iter == input_mapping.end()) {
+      continue;
+    }
+    if (!ge::AttrUtils::SetInt(input->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, iter->second)) {
+      GE_LOGE("UpdateInputMapping failed: set attr ATTR_NAME_PARENT_NODE_INDEX failed.");
+      return GRAPH_FAILED;
+    }
+  }
+
+  return GRAPH_SUCCESS;
+}
+
+///
+/// @brief Update output-mapping
+/// @param [in] output_mapping : index_of_cur_graph_node_output -> index_of_new_graph_node_output
+/// @return graphStatus
+///
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
+ComputeGraph::UpdateOutputMapping(const std::map<uint32_t, uint32_t> &output_mapping) {
+  NodePtr net_output = FindNode(kNodeNameNetOutput);
+  if (net_output == nullptr) {
+    GE_LOGE("UpdateOutputMapping failed: node %s not exist in graph.", kNodeNameNetOutput);
+    return GRAPH_FAILED;
+  }
+  OpDescPtr op_desc = net_output->GetOpDesc();
+  if (op_desc == nullptr) {
+    GE_LOGE("UpdateOutputMapping failed: op_desc is NULL.");
+    return GRAPH_FAILED;
+  }
+
+  size_t num = op_desc->GetInputsSize();
+  for (size_t i = 0; i < num; i++) {
+    GeTensorDesc tensor = op_desc->GetInputDesc(i);
+    uint32_t cur_index = 0;
+    if (!ge::AttrUtils::GetInt(tensor, ATTR_NAME_PARENT_NODE_INDEX, cur_index)) {
+      continue;
+    }
+    auto iter = output_mapping.find(cur_index);
+    if (iter == output_mapping.end()) {
+      continue;
+    }
+    if (!ge::AttrUtils::SetInt(tensor, ATTR_NAME_PARENT_NODE_INDEX, iter->second)) {
+      GE_LOGE("UpdateOutputMapping failed: set attr ATTR_NAME_PARENT_NODE_INDEX failed.");
+      return GRAPH_FAILED;
+    }
+    if (op_desc->UpdateInputDesc(i, tensor) != GRAPH_SUCCESS) {
+      GE_LOGE("UpdateOutputMapping failed: update %zu input_tensor failed.", i);
+      return GRAPH_FAILED;
+    }
+  }
+
+  return GRAPH_SUCCESS;
+}
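(Not part of this patch: a hedged sketch of UpdateInputMapping with illustrative indices. After swapping the first two graph inputs, the parent-node-index attribute on each Data node is remapped like so.)

std::map<uint32_t, uint32_t> input_mapping = {{0, 1}, {1, 0}};
ge::graphStatus status = graph->UpdateInputMapping(input_mapping);
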
+
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ComputeGraph::InsertEventNodes() {
   std::vector<NodePtr> node_vec = nodes_;
   for (const auto &node : GetAllNodes()) {
@@ -551,6 +708,23 @@ graphStatus ComputeGraph::CollectBreadthOutNode(const NodePtr &node, std::map<NodePtr, uint32_t>
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ComputeGraph::TopologicalSorting() {
+  auto ret = TopologicalSortingSubgraph();
+  if (ret != SUCCESS) {
+    GELOGE(ret, "Sub graph topological sort Failed");
+    return ret;
+  }
+  for (const auto &sub_graph : sub_graph_) {
+    ret = sub_graph->TopologicalSortingSubgraph();
+    if (ret != SUCCESS) {
+      GELOGE(ret, "Sub graph topological sort Failed");
+      return ret;
+    }
+  }
+  return SUCCESS;
+}
+
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ComputeGraph::TopologicalSortingSubgraph() {
   std::vector<NodePtr> node_vec;
   std::map<NodePtr, uint32_t> map_in_edge_num;
   bool use_BFS = false;
@@ -598,6 +772,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ComputeGraph::Topolog
     node->GetOpDesc()->SetId(i);  // [node->GetOpDesc(): should not be null]
     nodes_.push_back(node);
   }
+  is_valid_flag_ = true;
   return GRAPH_SUCCESS;
 }
 
@@ -614,7 +789,7 @@ graphStatus ComputeGraph::SortNodes(std::vector<NodePtr> &stack, std::map<NodePtr, uint32_t>
     GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr, continue);
     map_in_edge_num[node] = static_cast<uint32_t>(GetInEdgeSize(node));
     if (map_in_edge_num[node] == 0) {
@@ -640,16 +815,16 @@ graphStatus ComputeGraph::SortNodes(std::vector<NodePtr> &stack, std::map<NodePtr, uint32_t>
+    auto it_i = std::find(inputs_order_.begin(), inputs_order_.end(), stack[i]->GetName());
+    GE_IF_BOOL_EXEC(it_i == inputs_order_.end(), continue);
+    auto inx_i = it_i - inputs_order_.begin();
     for (size_t j = i + 1; j < stack.size(); ++j) {
       // If not found in 'inputs_order_', skip it
-      auto it_i = std::find(inputs_order_.begin(), inputs_order_.end(), stack[i]->GetName());
-      GE_IF_BOOL_EXEC(it_i == inputs_order_.end(), continue);
       auto it_j = std::find(inputs_order_.begin(), inputs_order_.end(), stack[j]->GetName());
       GE_IF_BOOL_EXEC(it_j == inputs_order_.end(), continue);
       // Compare index, swap them if it should be
-      auto inx_i = it_i - inputs_order_.begin();
       auto inx_j = it_j - inputs_order_.begin();
       GE_IF_BOOL_EXEC(inx_i < inx_j, std::swap(stack[i], stack[j]));
     }
@@ -663,7 +838,7 @@ size_t ComputeGraph::GetInEdgeSize(const NodePtr &node) {
     return in_edge_size;
   }
   for (const auto &anchor : node->GetAllInDataAnchors()) {
-    in_edge_size = in_edge_size + anchor->GetPeerAnchors().size();
+    in_edge_size = in_edge_size + anchor->GetPeerAnchorsSize();
     // Break flow control data loop.
     OutDataAnchorPtr out_anchor = anchor->GetPeerOutAnchor();
     if ((out_anchor != nullptr) && (out_anchor->GetOwnerNode() != nullptr)) {
@@ -680,10 +855,11 @@ size_t ComputeGraph::GetInEdgeSize(const NodePtr &node) {
     }
   }
   if (node->GetInControlAnchor() != nullptr) {
-    in_edge_size = in_edge_size + node->GetInControlAnchor()->GetPeerAnchors().size();
+    in_edge_size = in_edge_size + node->GetInControlAnchor()->GetPeerAnchorsSize();
   }
   return in_edge_size;
 }
+
 size_t ComputeGraph::GetOutEdgeSize(const NodePtr &node) {
   size_t out_edge_size = 0;
   if (node == nullptr) {
@@ -699,7 +875,7 @@ size_t ComputeGraph::GetOutEdgeSize(const NodePtr &node) {
     }
   }
   if (node->GetOutControlAnchor() != nullptr) {
-    if (out_edge_size > (UINT32_MAX - node->GetOutControlAnchor()->GetPeerAnchors().size())) {
+    if (out_edge_size > (UINT64_MAX - node->GetOutControlAnchor()->GetPeerAnchors().size())) {
       return 0;
     }
     out_edge_size = out_edge_size + node->GetOutControlAnchor()->GetPeerAnchors().size();
@@ -724,17 +900,18 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void ComputeGraph::Dump() const {
                              peer_in_anchor->GetOwnerNode()->GetName().c_str()));
     }
   }
-    GE_IF_BOOL_EXEC(node->GetOutControlAnchor() == nullptr, GELOGE(GRAPH_FAILED, "Out control anchor is null");
-                    return );
-    for (const auto &peer_in_anchor : node->GetOutControlAnchor()->GetPeerInControlAnchors()) {
-      GE_IF_BOOL_EXEC(peer_in_anchor != nullptr && peer_in_anchor->GetOwnerNode() != nullptr,
-                      GELOGI("node name = %s, out control node name = %s.", node->GetName().c_str(),
-                             peer_in_anchor->GetOwnerNode()->GetName().c_str()));
-    }
-    for (const auto &peer_in_anchor : node->GetOutControlAnchor()->GetPeerInDataAnchors()) {
-      GE_IF_BOOL_EXEC(peer_in_anchor != nullptr && peer_in_anchor->GetOwnerNode() != nullptr,
-                      GELOGI("node name = %s, out control node name = %s.", node->GetName().c_str(),
-                             peer_in_anchor->GetOwnerNode()->GetName().c_str()));
+    auto out_control_anchor = node->GetOutControlAnchor();
+    if (out_control_anchor != nullptr) {
+      for (const auto &peer_in_anchor : 
out_control_anchor->GetPeerInControlAnchors()) { + GE_IF_BOOL_EXEC(peer_in_anchor != nullptr && peer_in_anchor->GetOwnerNode() != nullptr, + GELOGI("node name = %s, out control node name = %s.", node->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str())); + } + for (const auto &peer_in_anchor : out_control_anchor->GetPeerInDataAnchors()) { + GE_IF_BOOL_EXEC(peer_in_anchor != nullptr && peer_in_anchor->GetOwnerNode() != nullptr, + GELOGI("node name = %s, out control node name = %s.", node->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str())); + } } } } diff --git a/src/common/graph/debug/ge_log.h b/src/common/graph/debug/ge_log.h index a72b5886..14a66709 100644 --- a/src/common/graph/debug/ge_log.h +++ b/src/common/graph/debug/ge_log.h @@ -18,21 +18,9 @@ #define COMMON_GRAPH_DEBUG_GE_LOG_H_ #include "graph/ge_error_codes.h" -#include "toolchain/slog.h" #include "framework/common/debug/ge_log.h" -#define GE_MOD_ID GE - -#ifdef _MSC_VER -#define FUNC_NAME __FUNCTION__ -#else -#define FUNC_NAME __PRETTY_FUNCTION__ -#endif - -#define D_GE_LOGE(fmt, ...) \ - dlog_error(static_cast(GE_MOD_ID), "%s:" fmt, __FUNCTION__, ##__VA_ARGS__) - -#define GE_LOGE(...) D_GE_LOGE(__VA_ARGS__) +#define GE_LOGE(...) GE_LOG_ERROR(GE_MODULE_NAME, ge::FAILED, __VA_ARGS__) #define GE_LOGI_IF(condition, ...) \ if ((condition)) { \ @@ -44,15 +32,15 @@ GELOGW(__VA_ARGS__); \ } -#define GE_LOGE_IF(condition, ...) \ - if ((condition)) { \ - GELOGE(ge::GRAPH_FAILED, __VA_ARGS__); \ +#define GE_LOGE_IF(condition, ...) \ + if ((condition)) { \ + GELOGE(ge::FAILED, __VA_ARGS__); \ } #define GE_CHK_STATUS_RET_NOLOG(expr) \ do { \ const ge::graphStatus _status = (expr); \ - if (_status != ge::GRAPH_SUCCESS) { \ + if (ge::SUCCESS != _status) { \ return _status; \ } \ } while (0) @@ -61,7 +49,7 @@ do { \ bool b = (expr); \ if (!b) { \ - GELOGE(ge::GRAPH_FAILED, __VA_ARGS__); \ + GELOGE(ge::FAILED, __VA_ARGS__); \ return _status; \ } \ } while (0) @@ -85,7 +73,7 @@ do { \ const ge::graphStatus _status = (expr); \ if (_status) { \ - GELOGE(ge::GRAPH_FAILED, __VA_ARGS__); \ + GELOGE(ge::FAILED, __VA_ARGS__); \ return _status; \ } \ } while (0) @@ -95,7 +83,7 @@ { \ bool b = (expr); \ if (b) { \ - GELOGE(ge::GRAPH_FAILED, __VA_ARGS__); \ + GELOGE(ge::FAILED, __VA_ARGS__); \ exec_expr; \ } \ } @@ -119,63 +107,41 @@ } while (0) // If expr is not true, the log is printed and a custom statement is executed -#define GE_CHK_BOOL_EXEC(expr, exec_expr, ...) \ - { \ - bool b = (expr); \ - if (!b) { \ - GELOGE(ge::GRAPH_FAILED, __VA_ARGS__); \ - exec_expr; \ - } \ - } - -// If expr is not true, the log is printed and a custom statement is executed -#define GE_CHK_BOOL_EXEC_INFO(expr, exec_expr, ...) \ - { \ - bool b = (expr); \ - if (!b) { \ - GELOGI(__VA_ARGS__); \ - exec_expr; \ - } \ +#define GE_CHK_BOOL_EXEC(expr, exec_expr, ...) \ + { \ + bool b = (expr); \ + if (!b) { \ + GELOGE(ge::FAILED, __VA_ARGS__); \ + exec_expr; \ + } \ } // If expr is not true, the log is printed and a custom statement is executed -#define GE_CHK_BOOL_EXEC_DEBUG(expr, exec_expr, ...) \ - { \ - bool b = (expr); \ - if (!b) { \ - GELOGD(__VA_ARGS__); \ - exec_expr; \ - } \ +#define GE_CHK_BOOL_EXEC_INFO(expr, exec_expr, ...) \ + { \ + bool b = (expr); \ + if (!b) { \ + GELOGI(__VA_ARGS__); \ + exec_expr; \ + } \ } // If expr is not GRAPH_SUCCESS, print the log and return the same value -#define GE_CHK_STATUS_RET(expr, ...) 
\ - do { \ - const ge::graphStatus _status = (expr); \ - if (_status != ge::GRAPH_SUCCESS) { \ - GELOGE(ge::GRAPH_FAILED, __VA_ARGS__); \ - return _status; \ - } \ +#define GE_CHK_STATUS_RET(expr, ...) \ + do { \ + const ge::graphStatus _status = (expr); \ + if (ge::SUCCESS != _status) { \ + GELOGE(ge::FAILED, __VA_ARGS__); \ + return _status; \ + } \ } while (0) -#define GE_MAKE_SHARED(exec_expr0, exec_expr1) \ - try { \ - exec_expr0; \ - } catch (...) { \ - GELOGE(ge::GRAPH_FAILED, "Make shared failed"); \ - exec_expr1; \ +#define GE_MAKE_SHARED(exec_expr0, exec_expr1) \ + try { \ + exec_expr0; \ + } catch (...) { \ + GELOGE(ge::FAILED, "Make shared failed"); \ + exec_expr1; \ } -/// CCE related macro definition -/// If expr is not CC_STATUS_GRAPH_SUCCESS, print the log and return -#define GE_CHK_CCE_RET(expr) \ - do { \ - ccgraphStatus_t _cc_ret = (expr); \ - if (_cc_ret != CC_STATUS_GRAPH_SUCCESS) { \ - GELOGE(ge::GRAPH_FAILED, "Call cce api failed, ret: 0x%X", _cc_ret); \ - return ge::GRAPH_FAILED; \ - } \ - } while (0) - #endif // COMMON_GRAPH_DEBUG_GE_LOG_H_ - diff --git a/src/common/graph/debug/ge_util.h b/src/common/graph/debug/ge_util.h index d982e44b..4c6ae051 100644 --- a/src/common/graph/debug/ge_util.h +++ b/src/common/graph/debug/ge_util.h @@ -25,7 +25,6 @@ #include #include #include - #include "framework/common/debug/ge_log.h" #include "graph/debug/ge_log.h" #include "graph/ge_error_codes.h" diff --git a/src/common/graph/debug/graph_debug.cc b/src/common/graph/debug/graph_debug.cc index 930609b2..7ce9db37 100644 --- a/src/common/graph/debug/graph_debug.cc +++ b/src/common/graph/debug/graph_debug.cc @@ -15,12 +15,10 @@ */ #include "graph/debug/graph_debug.h" - #include #include #include #include "debug/ge_util.h" - #include "framework/common/debug/ge_log.h" #define TAB " " diff --git a/src/common/graph/debug/graph_debug.h b/src/common/graph/debug/graph_debug.h index 90548869..29de632a 100644 --- a/src/common/graph/debug/graph_debug.h +++ b/src/common/graph/debug/graph_debug.h @@ -16,13 +16,11 @@ #ifndef COMMON_GRAPH_DEBUG_GRAPH_DEBUG_H_ #define COMMON_GRAPH_DEBUG_GRAPH_DEBUG_H_ - #include #include #include #include #include - #include "external/graph/graph.h" #include "./ge_error_codes.h" #include "graph/compute_graph.h" diff --git a/src/common/graph/detail/attributes_holder.cc b/src/common/graph/detail/attributes_holder.cc index e75d5d1a..113f4b6f 100644 --- a/src/common/graph/detail/attributes_holder.cc +++ b/src/common/graph/detail/attributes_holder.cc @@ -15,9 +15,7 @@ */ #include "detail/attributes_holder.h" - #include - #include "debug/ge_log.h" #include "debug/ge_util.h" #include "framework/common/debug/ge_log.h" diff --git a/src/common/graph/format_refiner.cc b/src/common/graph/format_refiner.cc index 037b18c9..2230dc1b 100644 --- a/src/common/graph/format_refiner.cc +++ b/src/common/graph/format_refiner.cc @@ -14,14 +14,12 @@ * limitations under the License. 
 */
 
-#include "graph/format_refiner.h"
-
+#include "format_refiner.h"
 #include
 #include
 #include
 #include
 #include
-
 #include "./compute_graph.h"
 #include "./ge_error_codes.h"
 #include "./graph/ge_tensor.h"
@@ -57,6 +55,7 @@ graphStatus FormatRefiner::RefreshConstantOutProcess(const OpDescPtr &op_desc) {
   }
   return GRAPH_SUCCESS;
 }
+
 graphStatus FormatRefiner::GetAnchorPoints(const ge::ComputeGraphPtr &graph, std::vector<ge::NodePtr> &anchor_points,
                                            std::vector<ge::NodePtr> &data_nodes,
                                            std::unordered_map<ge::NodePtr, bool> &node_status) {
@@ -82,10 +81,10 @@ graphStatus FormatRefiner::GetAnchorPoints(const ge::ComputeGraphPtr &graph, std
     // consider special node save process
     // get all input desc format
     bool node_is_all_nd = false;
-    for (uint32_t i = 0; i < static_cast<uint32_t>(op_desc->GetInputsSize()); i++) {
-      auto input_desc = op_desc->GetInputDesc(i);
+    auto input_size = static_cast<uint32_t>(op_desc->GetInputsSize());
+    for (uint32_t i = 0; i < input_size; i++) {
       // Operator pre-set format but not origin format
-      auto input_format = input_desc.GetFormat();
+      auto input_format = op_desc->MutableInputDesc(i)->GetFormat();
       // Pre-save data node and default infer fail
       if (node_ptr->GetType() == DATA) {
         data_nodes.push_back(node_ptr);
@@ -95,9 +94,9 @@ graphStatus FormatRefiner::GetAnchorPoints(const ge::ComputeGraphPtr &graph, std
       }
     }
     // Get all output desc format
-    for (uint32_t i = 0; i < static_cast<uint32_t>(op_desc->GetOutputsSize()); i++) {
-      GeTensorDesc output_desc = op_desc->GetOutputDesc(i);
-      auto output_format = output_desc.GetFormat();
+    auto output_size = static_cast<uint32_t>(op_desc->GetOutputsSize());
+    for (uint32_t i = 0; i < output_size; i++) {
+      auto output_format = op_desc->MutableOutputDesc(i)->GetFormat();
       if (output_format != FORMAT_ND && output_format != FORMAT_RESERVED) {
         node_is_all_nd = true;
       }
@@ -145,7 +144,8 @@ graphStatus FormatRefiner::BackInferProcess(std::deque<ge::NodePtr> &nodes, ge::
   for (const auto &in_anchor : node->GetAllInDataAnchors()) {
     GELOGD("Node is [%s] [B]", (node->GetName()).c_str());
     auto in_data_anchor_idx = in_anchor->GetIdx();
-    auto to_be_set_format = (node->GetOpDesc()->GetInputDesc(in_data_anchor_idx)).GetOriginFormat();
+    auto to_be_set_format =
+        node->GetOpDesc()->MutableInputDesc(static_cast<uint32_t>(in_data_anchor_idx))->GetOriginFormat();
     if (to_be_set_format == FORMAT_ND) {
       GELOGD("Node [%s] [B], format is ND", (node->GetName()).c_str());
       continue;
@@ -162,7 +162,7 @@ graphStatus FormatRefiner::BackInferProcess(std::deque<ge::NodePtr> &nodes, ge::
     }
     // Check format whether have been set
     int idx = peer_out_data_anchor->GetIdx();
-    auto ge_tensor_desc = peer_out_data_node->GetOpDesc()->GetOutputDesc(idx);
+    auto ge_tensor_desc = peer_out_data_node->GetOpDesc()->GetOutputDesc(static_cast<uint32_t>(idx));
     if (ge_tensor_desc.GetOriginFormat() == FORMAT_ND) {
       auto dim_num = ge_tensor_desc.GetShape().GetDimNum();
       if (dim_num == 0) {
@@ -182,7 +182,7 @@ graphStatus FormatRefiner::BackInferProcess(std::deque<ge::NodePtr> &nodes, ge::
 
     ge_tensor_desc.SetOriginFormat(to_be_set_format);
     ge_tensor_desc.SetFormat(to_be_set_format);
-    (void)peer_out_data_node->GetOpDesc()->UpdateOutputDesc(idx, ge_tensor_desc);
+    (void)peer_out_data_node->GetOpDesc()->UpdateOutputDesc(static_cast<uint32_t>(idx), ge_tensor_desc);
 
     // Call operator infer format api (forward) to get out format
     GELOGD("call infer format func[Back]!Node is [%s] ", (peer_out_data_node->GetName()).c_str());
@@ -205,7 +205,8 @@ graphStatus FormatRefiner::ForwardInferProcess(std::deque<ge::NodePtr> &nodes, g
     GELOGD("Node is [%s] [F]", (node->GetName()).c_str());
     GE_IF_BOOL_EXEC(out_data_anchor == nullptr, continue);
     auto out_data_anchor_idx = out_data_anchor->GetIdx();
-    auto to_be_set_format = (node->GetOpDesc()->GetOutputDesc(out_data_anchor_idx)).GetOriginFormat();
+    auto to_be_set_format =
+        node->GetOpDesc()->MutableOutputDesc(static_cast<uint32_t>(out_data_anchor_idx))->GetOriginFormat();
     if (to_be_set_format == FORMAT_ND) {
       GELOGD("Node [%s] format is ND.[F]", (node->GetName()).c_str());
       continue;
@@ -222,7 +223,7 @@ graphStatus FormatRefiner::ForwardInferProcess(std::deque<ge::NodePtr> &nodes, g
     }
     // Check format whether have been set
     int idx = peer_in_data_anchor->GetIdx();
-    auto ge_tensor_desc = peer_in_data_node->GetOpDesc()->GetInputDesc(idx);
+    auto ge_tensor_desc = peer_in_data_node->GetOpDesc()->GetInputDesc(static_cast<uint32_t>(idx));
     if (ge_tensor_desc.GetOriginFormat() == FORMAT_ND) {
       auto dim_num = ge_tensor_desc.GetShape().GetDimNum();
       if (dim_num == 0) {
@@ -285,9 +286,9 @@ void FormatRefiner::SetInferOrigineFormatFlag(bool is_first) { is_first_infer =
 graphStatus FormatRefiner::DataNodeFormatProcess(std::vector<ge::NodePtr> &data_nodes, ge::Format data_format,
                                                  std::unordered_map<ge::NodePtr, bool> &node_status) {
   bool is_internal_format = TypeUtils::IsInternalFormat(data_format);
-  bool need_process = ((!is_first_infer) && (is_internal_format == false) && (data_format != FORMAT_ND));
+  bool need_process = (!is_first_infer) && (!is_internal_format) && (data_format != FORMAT_ND);
   if (!need_process) {
-    GELOGI("no necessary to do DataNodeFormatProcess.IsFirstInfer: %d, data_format:%s", is_first_infer,
+    GELOGI("no need to do DataNodeFormatProcess. is_first_infer:%d, data_format:%s", is_first_infer,
            TypeUtils::FormatToSerialString(data_format).c_str());
     return GRAPH_SUCCESS;
   }
@@ -378,9 +379,9 @@ graphStatus FormatRefiner::InferOrigineFormat(const ge::ComputeGraphPtr &graph)
   /// Notice: ignore 5D formats
   auto data_format = graph->GetDataFormat();
   status = DataNodeFormatProcess(data_nodes, data_format, node_status);
-
   // Set infer flag to false
   SetInferOrigineFormatFlag(false);
+
   return status;
 }
 }  // namespace ge
diff --git a/src/common/graph/ge_attr_define.cc b/src/common/graph/ge_attr_define.cc
index 55113191..92040051 100644
--- a/src/common/graph/ge_attr_define.cc
+++ b/src/common/graph/ge_attr_define.cc
@@ -42,6 +42,8 @@ const std::string ATTR_NAME_BIAS = "bias";
 
 const std::string ATTR_NAME_BIAS_TERM = "bias_term";
 
+const std::string ATTR_NAME_HAS_BIAS_VALUE = "has_bias_value";
+
 const std::string ATTR_NAME_PAD = "pad";
 const std::string ATTR_NAME_PADS = "pad";
@@ -83,6 +85,7 @@ const std::string ATTR_NAME_LRN_BETA = "lrn_beta";
 
 const std::string ATTR_NAME_AXIS = "axis";
 const std::string ATTR_NAME_BROADCAST = "broadcast";
+const std::string ATTR_NAME_OUTPUT = "output";
 const std::string ATTR_NAME_OUTPUT_NUM = "output_num";
 const std::string ATTR_NAME_TIDX = "t_idx";
@@ -103,6 +106,13 @@ const std::string ATTR_NAME_TSHAPE = "Tshape";
 const std::string ATTR_NAME_NAN_OPT = "nan_opt";
 
 const std::string ATTR_NAME_AIPP = "aipp";
+const std::string NEW_AIPP_CONV_OP = "new_conv_op_for_aipp";
+
+const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id";
+
+const std::string ATTR_NAME_MULTISHAPE_BATCHLIST = "multi_shape_batchlist";
+const std::string ATTR_NAME_MULTISHAPE_BATCHLIST_SIZE = "multi_shape_batchlist_size";
+const std::string ATTR_MODEL_BATCH_NUM = "batch_num";
 
 const std::string ATTR_NAME_INPUT_FORMAT = "input_format";
 const std::string ATTR_NAME_OUTPUT_FORMAT = "output_format";
@@ -111,6 +121,7 @@ const std::string ATTR_NAME_FRAMEWORK_NODE_DEF = "node_def";
 const std::string ATTR_NAME_FRAMEWORK_OP_DEF = "op_def";
 const std::string
ATTR_NAME_FRAMEWORK_FWK_TYPE = "framework_type"; const std::string ATTR_NAME_FRAMEWORK_FUNC_DEF = "func_def"; +const std::string ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE = "original_type"; const std::string ATTR_NAME_INPUT_TENSOR_DESC = "input_tensor_desc"; const std::string ATTR_NAME_OUTPUT_TENSOR_DESC = "output_tensor_desc"; @@ -122,15 +133,11 @@ const std::string ATTR_NAME_WEIGHTS = "value"; const std::string ATTR_NAME_WEIGHTS_DATA = "weights_data"; const std::string ATTR_NAME_BROACAST_REAL_DIM_CNT = "broacast_real_dim_cnt"; const std::string ATTR_NAME_DIM_ALIGN = "dim_align"; -const std::string ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE = "original_type"; - -const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id"; - -const std::string ATTR_NAME_AUTOMIC_ADD_START = "automic_add_addr_start"; -const std::string ATTR_NAME_AUTOMIC_ADD_MEM_SIZE = "automic_add_mem_size"; -const std::string ATTR_MODEL_BATCH_NUM = "batch_num"; const std::string ATTR_NAME_STREAM_LABEL = "_stream_label"; const std::string ATTR_NAME_STREAM_CYCLE_EVENT_FLAG = "need_stream_cycle_event"; +const std::string ATTR_NAME_RTSWITCH_RECV_EVENT_ID = "rtswitch_event_id"; +const std::string ATTR_NAME_AUTOMIC_ADD_START = "automic_add_addr_start"; +const std::string ATTR_NAME_AUTOMIC_ADD_MEM_SIZE = "automic_add_mem_size"; // To be deleted const std::string ATTR_TO_BE_DELETED = "to_be_deleted"; @@ -144,15 +151,13 @@ const std::string SSD_MBOX_OCR_FUSION = "permute_flatten_ocr_fusion"; const std::string SSD_MBOX_FUSION_BOX_TYPE_NUM = "ssd_mbox_fusion_box_type_num"; const std::string SSD_RESHAPE_SLICE_CONCAT_FUSION = "reshape_slice_concat_fusion"; -const std::string SSD_PRIORBOX_CONCAT = "ssd_mbox_conf_priorbox_concat_flag"; - // Refinedet const std::string REFINEDET_MBOX_LOC_FUSION = "permute_flatten_fusion"; -const std::string REFINEDET_RESHAPE_SLICE_CONCAT_FUSION = "reshape_slice_concat_fusion"; + const std::string REFINEDET_MBOX_CONF_FUSION = "permute_flatten_reshape_flatten_fusion"; const std::string REFINEDET_MBOX_FUSION_BOX_TYPE_NUM = "ssd_mbox_fusion_box_type_num"; -const std::string REFINEDET_PRIOR_BOX_ATTR_VARIANCE = "variance"; -const std::string REFINEDET_PRIOR_BOX_ATTR_VARIANCE_NUM = "variance_num"; +const std::string REFINEDET_RESHAPE_SLICE_CONCAT_FUSION = "reshape_slice_concat_fusion"; +const std::string SSD_PRIORBOX_CONCAT = "ssd_mbox_conf_priorbox_concat_flag"; // _Arg const std::string ATTR_NAME_INDEX = "index"; @@ -242,6 +247,30 @@ const std::string BATCHNORM_ATTR_ESTIMATED_MEAN = "estimated_mean"; const std::string BATCHNORM_ATTR_ESTIMATED_VARIANCE = "estimated_variance"; const std::string BATCHNORM_ATTR_SCALE = "scale"; const std::string BATCHNORM_ATTR_BIAS = "bias"; +const std::string BATCHNORM_ATTR_DATA_FORMAT = "data_format"; +const std::string BATCHNORM_ATTR_IS_TRAINING = "is_training"; +const std::string BATCHNORM_ATTR_IS_TRAINING_FUSION = "is_training_fusion"; + +// huberloss +const std::string HUBER_LOSS_ATTR_DELTA = "delta"; + +// SSDRealDivTileMul +const std::string SSD_REAL_DIV_TILE_MUL_ATTR_TILE_PARA = "tilepara"; + +// SSDSumMulRealDivMean +const std::string SSD_SUM_MUL_REALDIV_MEAN_ATTR_REDUCTION_INDICES = "reduction_indices"; +const std::string SSD_SUM_MUL_REALDIV_MEAN_ATTR_AXIS = "axis"; +const std::string SSD_SUM_MUL_REALDIV_MEAN_ATTR_MEAN_PARA = "mean_para"; +const std::string SSD_SUM_MUL_REALDIV_MEAN_ATTR_HAS_SUM = "has_sum"; + +// ConcatFive2Four +// ConcatFour2Five +const std::string SSD_BOX_TYPE_NUM = "box_type_num"; +const std::string SSD_CLASS_NUM = "class_num"; +const std::string 
TRANS_FOR_LOSS_MODE = "trans_for_loss_mode"; +const std::string SSD_FEATURE_MAP_SIZE = "feature_map_size"; +const std::string SSD_FEATURE_MAP_HIGH = "feature_map_high"; +const std::string SSD_FEATURE_MAP_WIDTH = "feature_map_width"; // Scale const std::string SCALE_ATTR_SCALE = "scale"; @@ -346,6 +375,7 @@ const std::string SOFTMAX_ATTR_AXIS = "axis"; // Permute const std::string PERMUTE_ATTR_ORDER = "order"; +const std::string PERMUTE_ATTR_PERM = "perm"; // SSD Normalize const std::string SSDNORMALIZE_ATTR_ACCROSS_SPATIAL = "across_spatial"; @@ -373,6 +403,10 @@ const std::string SSD_PRIOR_BOX_ATTR_ASPECT_RATIO_NUM = "aspect_ratio_num"; const std::string SSD_PRIOR_BOX_ATTR_VARIANCE = "variance"; const std::string SSD_PRIOR_BOX_ATTR_VARIANCE_NUM = "variance_num"; +// RefinedetDetectionOutput +const std::string REFINEDET_PRIOR_BOX_ATTR_VARIANCE_NUM = "variance_num"; +const std::string REFINEDET_PRIOR_BOX_ATTR_VARIANCE = "variance"; + // PRelu const std::string PRELU_ATTR_CHANNEL_SHARED = "channel_shared"; @@ -386,11 +420,16 @@ const std::string POWER_ATTR_NAME_POWER = "power"; const std::string POWER_ATTR_NAME_SCALE = "scale"; const std::string POWER_ATTR_NAME_SHIFT = "shift"; +// log +const std::string LOG_ATTR_NAME_SCALE = "scale"; +const std::string LOG_ATTR_NAME_SHIFT = "shift"; +const std::string LOG_ATTR_NAME_BASE = "base"; // Pack const std::string PACK_ATTR_NAME_NUM = "N"; // Unpack const std::string UNPACK_ATTR_NAME_NUM = "num"; +const std::string DYNAMIC_STITCH_ATTR_NAME_NUM = "DynamicStitchN_"; // Gathernd const std::string GATHERND_ATTR_NAME_TINDICES = "Tindices"; const std::string GATHERND_ATTR_NAME_TPARAMS = "Tparams"; @@ -400,6 +439,13 @@ const std::string ARGMAX_ATTR_NAME_TOPK = "topk"; const std::string ARGMAX_ATTR_NAME_REDUCESIZE = "reduce_size"; const std::string ARGMAX_ATTR_NAME_REDUCESTRIDE = "reduce_stride"; const std::string ARGMAX_ATTR_NAME_OUTMAX = "outmaxval"; +const std::string ARGMAX_ATTR_NAME_AXIS = "axis"; +const std::string ARGMAX_ATTR_NAME_AXISTYPE = "axis_type"; +const std::string ARGMAX_ATTR_NAME_KEEPDIMS = "keep_dims"; + +// upsample +const std::string UPSAMPLE_ATTR_NAME_SCALE_H = "scale_h"; +const std::string UPSAMPLE_ATTR_NAME_SCALE_W = "scale_w"; // Relu const std::string ATTR_NAME_NEGATIVE_SLOPE = "negative_slope"; @@ -416,6 +462,7 @@ const std::string SPLIT_ATTR_NAME_NUM_SPLIT = "num_split"; const std::string TVM_ATTR_NAME_MAGIC = "tvm_magic"; const std::string TVM_ATTR_NAME_BLOCKDIM = "tvm_blockdim"; const std::string TVM_ATTR_NAME_METADATA = "tvm_metadata"; +const std::string TVM_ATTR_NAME_WORKSPACE_TYPE = "tvm_workspace_type"; // Squeeze const std::string SQUEEZE_ATTR_AXIS = "axis"; @@ -438,6 +485,7 @@ const std::string ROIALIGN_ATTR_SPATIAL_SCALE = "spatial_scale"; const std::string ROIALIGN_ATTR_SAMPLING_RATIO = "sampling_ratio"; const std::string ROIALIGN_ATTR_NAME_POOLED_H = "pooled_h"; const std::string ROIALIGN_ATTR_NAME_POOLED_W = "pooled_w"; +const std::string ROIALIGN_ATTR_NAME_TF = "roialign_tf"; // Generate_rpn_proposal const std::string GENERATE_RPN_PROPOSAL_ATTR_PRE_NMS_TOPK = "pre_nms_topk"; @@ -536,19 +584,42 @@ const std::string CONV_GRAD_FILTER_OUTPUT_SHAPE = "conv_grad_filter_output_shape const std::string CONV_GRAD_INPUT_OUTPUT_SHAPE = "conv_grad_input_output_shape"; // Rnn -const std::string RNN_MODE_ = "rnn_"; -const std::string CNN_RNN = "cnn_rnn"; +const std::string RNN_TENSORFLOW = "rnn_tensorflow"; +const std::string RNN_MODE_STATIC = "rnn_static"; const std::string MUTI_RNN = "multi_rnn"; +const std::string CNN_RNN = 
"cnn_rnn"; +const std::string RNN_MODE_ = "rnn_"; + const std::string CELL_MODE = "mode"; const std::string LSTM_CELL = "lstm_cell"; const std::string GRU_CELL = "gru_cell"; const std::string RNN_HT = "ht"; const std::string RNN_XT_HT = "xt_ht"; const std::string RNN_BATCH_SIZE = "batch_size"; +const std::string LSTM_CELL_CLIP = "lstm_cell_clip"; +const std::string LSTM_PROJ_CLIP = "lstm_proj_clip"; +const std::string LSTM_ACTIVATE = "lstm_activate"; +const std::string LSTM_OUT_MAP = "lstm_out_map"; +const std::string LSTM_OUT_MODE = "lstm_out_mode"; +const std::string LSTM_STATE_OUT_MODE = "lstm_state_out_mode"; +const std::string LSTM_TIME_MAJOR = "lstm_time_major"; +const std::string LSTM_IS_INPUT_PRE_PROCESS = "lstm_is_input_pre_process"; // Upsample const std::string UPSAMPLE_ATTR_NAME_SCALE = "scale"; +// PadV2 +const std::string PADV2_ATTR_NAME_MODE = "mode"; +const std::string PADV2_ATTR_NAME_PADS = "paddings"; +const std::string PADV2_ATTR_NAME_T = "T"; +const std::string PADV2_ATTR_NAME_PAD_FORMAT = "pad_format"; +const std::string PADV2_ATTR_NAME_CONST_VALUE = "const_value"; + +// MirrorPad +const std::string MIRRORPAD_ATTR_NAME_MODE = "mode"; +const std::string MIRRORPAD_ATTR_NAME_PADS = "paddings"; +const std::string MIRRORPAD_ATTR_NAME_PAD_FORMAT = "pad_format"; +const std::string MIRRORPAD_ATTR_NAME_CONST_VALUE = "const_value"; // Filler const std::string FILLER_TYPE = "filler_type"; const std::string FILLER_VALUE = "filler_value"; @@ -559,9 +630,6 @@ const std::string SHUFFLE_CHANNEL_GROUP = "group"; // TopKV2 const std::string TOPKV2_ATTR_K = "k"; -const std::string DEPTH_SPACE_ATTR_BLOCK_SIZE = "block_size"; -const std::string L2_NORMALIZE_ATTR_EPS = "eps"; - // Calibaration const std::string STRIDE_H_INDEX = "STRIDE_H_INDEX"; const std::string STRIDE_W_INDEX = "STRIDE_W_INDEX"; @@ -616,6 +684,8 @@ const std::string ATTR_MODEL_STREAM_NUM = "stream_num"; const std::string ATTR_MODEL_EVENT_NUM = "event_num"; +const std::string ATTR_MODEL_LABEL_NUM = "label_num"; + const std::string ATTR_MODEL_MEMORY_SIZE = "memory_size"; const std::string ATTR_MODEL_WEIGHT_SIZE = "weight_size"; @@ -630,6 +700,8 @@ const std::string ATTR_MODEL_VAR_SIZE = "variable_size"; const std::string ATTR_MODEL_TASK_INDEX_OP_NAME = "task_index_op_name"; +const std::string ATTR_MODEL_CORE_TYPE = "core_type"; + // Public attribute const std::string ATTR_NAME_IMPLY_TYPE = "imply_type"; @@ -661,17 +733,145 @@ const std::string TARGET_TYPE_TINY = "TINY"; const std::string TARGET_TYPE_LITE = "LITE"; +// l2_normalize +const std::string L2_NORMALIZE_ATTR_AXIS = "axis"; +const std::string L2_NORMALIZE_ATTR_EPS = "eps"; + +const std::string POOL_PARAMA_ATTR_WINDOW = "window"; +const std::string POOL_PARAMA_ATTR_CEIL_MODE = "ceil_mode"; +const std::string POOL_PARAMA_ATTR_DATA_MODE = "data_mode"; +const std::string POOL_PARAMA_ATTR_GLOBAL_POOLING = "global_pooling"; +const std::string POOL_PARAMA_ATTR_NAN_OP = "nan_opt"; +const std::string POOL_PARAMA_ATTR_PAD_MOD = "pad_mode"; + +// HCOM +const std::string HCOM_ATTR_ROOT_RANK = "root_rank"; +const std::string HCOM_ATTR_RANK_SIZE = "rank_size"; + +const std::string HCOM_ATTR_REDUCE_TYPE = "reduction"; +const std::string HCOM_ATTR_GROUP = "group"; +const std::string HCOM_ATTR_SR_TAG = "sr_tag"; +const std::string HCOM_ATTR_SRC_RANK = "src_rank"; +const std::string HCOM_ATTR_DEST_RANK = "dest_rank"; +const std::string HCOM_ATTR_FUSION = "fusion"; +const std::string HCOM_ATTR_SHAPE = "shape"; +const std::string HCOM_ATTR_DATA_TYPE = "dtype"; + +// 
SpaceToDepth/DepthToSpace
+const std::string ATTR_NAME_BLOCK_SIZE = "block_size";
+
+// SparseSoftmaxCrossEntropyWithLogits
+const std::string SPARSE_SOFT_MAX_ATTR_TLABLES = "Tlabels";
+
+// MaxPoolGradWithArgmax
+const std::string MAX_POOL_GRAD_OUTPUT_SHAPE = "max_pool_grad_output_shape";
+
+// AvgPoolGrad
+const std::string AVG_POOL_GRAD_OUTPUT_SHAPE = "avg_pool_grad_output_shape";
+
+// Pad
+const std::string ATTR_PAD_FORMAT = "attr_pad_format";
+
+// Variable
+const std::string VAR_ATTR_FORMAT = "_var_format";
+const std::string VAR_ATTR_NAME = "var_name";
+const std::string VAR_ATTR_FRACTALZ_FORMAT = "FZ";
+const std::string VAR_ATTR_4D_FORMAT = "4D";
+const std::string VAR_ATTR_5D_FORMAT = "5D";
+const std::string VAR_ATTR_DATA_TYPE = "data_format";
+const std::string VAR_ATTR_VAR_IN_NAME = "var_in_name";
+const std::string VAR_ATTR_VAR_IN_INDEX = "var_in_index";
+const std::string VAR_ATTR_VAR_OUT_INDEX = "var_out_index";
+const std::string VAR_ATTR_SHAPE = "shape";
+const std::string HALF_VAR_NAME_END = "_fp16";
+const std::string VAR_ATTR_INITED = "var_is_inited";
+
+const std::string VAR_ATTR_CONTAINER = "container";
+const std::string VAR_ATTR_SHARED_NAME = "shared_name";
+const std::string VAR_ATTR_DTYPE = "dtype";
+
+const std::string VAR_ATTR_SRC_VAR_NAME = "_src_var_name";
+const std::string VAR_ATTR_VAR_IS_SAVE = "_var_is_save";
+const std::string VAR_ATTR_VAR_IS_RESTORE = "_var_is_restore";
+const std::string VAR_ATTR_VAR_IS_BROADCAST = "_var_is_broadcast";
+const std::string REF_VAR_SRC_VAR_NAME = "ref_var_src_var_name";
+const std::string REF_VAR_PRE_PEER_OUT_INDEX = "ref_var_pre_peer_out_index";
+
+// Assign
+const std::string ASSIGN_VALIDATE_SHAPE = "validate_shape";
+
+// space2batch batch2space
+const std::string BATCH_SPACE_ATTR_BLOCK = "block";
+const std::string BATCH_SPACE_ATTR_PADDING = "padding";
+
+// depth_to_space space_to_depth
+const std::string DEPTH_SPACE_ATTR_BLOCK_SIZE = "block_size";
+
+// FakeQuantWithMinMaxVars
+const std::string FakeQuantWithMinMaxVars_ATTR_MAX = "max";
+const std::string FakeQuantWithMinMaxVars_ATTR_MIN = "min";
+
+// mobilenet_ssd_conv_fusion
+const std::string SSD_BOXPREDICTOR_BOXES_FUSION = "ssd_boxpredictor_boxes_fusion";
+const std::string SSD_BOXPREDICTOR_SCORES_FUSION = "ssd_boxpredictor_scores_fusion";
+const std::string SSD_BOXPREDICTOR_FUSION_BOX_TYPE_NUM = "ssd_boxpredictor_fusion_box_type_num";
+
+// lsh project
+const std::string LSH_PROJ_TYPE = "lsh_project_type";
+
+// log time stamp
+const std::string LOG_TIME_STAMP_LOGID = "logid";
+const std::string LOG_TIME_STAMP_NOTIFY = "notify";
+
+// ShapeN
+const std::string SHAPEN_ATTR_N = "N";
+const std::string SHAPEN_ATTR_IN_TYPE = "in_type";
+const std::string SHAPEN_ATTR_OUT_TYPE = "dtype";
+
+// GatherV2 attr def
+const std::string GATHERV2_ATTR_NAME_TAXIS = "Taxis";
+const std::string GATHERV2_ATTR_NAME_TINDICES = "Tindices";
+const std::string GATHERV2_ATTR_NAME_TPARAMS = "Tparams";
+
+// Reshape attr def
+const std::string RESHAPE_ATTR_NAME_INPUT_DESC = "input_desc_reshape";
+const std::string RESHAPE_ATTR_NAME_OUTPUT_DESC = "output_desc_reshape";
+
+// axis attr def
+const std::string ATTR_NAME_AXIS_ORG_OP = "axis_org_op";
+
+const std::string ATTR_NAME_LINK_WITH_SPARE = "link_with_sparse";
+
+const std::string ATTR_NAME_NET_OUTPUT_FORMAT = "net_output_format";
+const std::string ATTR_NAME_NET_OUTPUT_DATATYPE = "net_output_datatype";
+
+// For constant folding
+const std::string ATTR_NO_NEED_CONSTANT_FOLDING = "no_need_constant_folding";
+
 const std::string
ATTR_NAME_CONTINUOUS_INPUT = "continuous_input"; const std::string ATTR_NAME_CONTINUOUS_OUTPUT = "continuous_output"; const std::string ATTR_NAME_REFERENCE = "reference"; +const std::string ATTR_NAME_NOTASK = "_no_task"; + +const std::string ATTR_NAME_OUTPUT_REUSE_INPUT = "_output_reuse_input"; + +const std::string ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX = "_reuse_input_on_dim_index"; + +const std::string ATTR_NAME_NOPADDING_CONTINUOUS_INPUT = "_no_padding_continuous_input"; + +const std::string ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT = "_no_padding_continuous_output"; + const std::string ATTR_NAME_ATOMIC_INDEX = "atomic_index"; // Used for mark the active label list stream of activated node const std::string ATTR_NAME_ACTIVE_LABEL_LIST = "_active_label_list"; +// Used for l2cache, true: the memory of all inputs is used for the last time. +const std::string ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE = "is_end_of_inputmem_lifecycle"; + // Multi batch const std::string ATTR_NAME_PRED_VALUE = "_pred_value"; const std::string ATTR_NAME_BATCH_NUM = "_batch_num"; @@ -682,6 +882,8 @@ const std::string ATTR_NAME_STREAM_SWITCH_COND = "switch_condition"; const std::string ATTR_NAME_TRUE_BRANCH_STREAM = "true_branch_stream"; const std::string ATTR_NAME_ACTIVE_STREAM_LIST = "active_stream_list"; const std::string ATTR_NAME_SWITCHN_PRED_VALUE = "switch_pred_value"; +const std::string ATTR_NAME_ITERATORS_PER_LOOP = "iterations_per_loop"; +const std::string ATTR_NAME_FLOW_CTRL_NODE_FLAG = "is_flow_ctrl_node"; const std::string ATTR_NAME_SWITCH_BRANCH_NODE_LABEL = "_switch_branch_node_label"; const std::string ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG = "_switch_true_branch_flag"; @@ -691,6 +893,9 @@ const std::string ATTR_NAME_CYCLIC_DEPENDENCE_FLAG = "_cyclic_dependence_flag"; const std::string ATTR_NAME_NEXT_ITERATION = "_next_iteration_node"; +// Function Op +const std::string ATTR_NAME_PARENT_NODE_INDEX = "_parent_node_index"; + // Used for mark the active node is for loop, type:bool const std::string ATTR_NAME_IS_LOOP_ACTIVE = "is_loop_active"; @@ -702,6 +907,20 @@ const std::string ATTR_NAME_MEMORY_TYPE_WORKSPACE = "memory_type_workspace"; const std::string MODEL_ATTR_SESSION_ID = "session_id"; +// l1 fusion and other fusion in future +const std::string ATTR_NAME_L1_FUSION_GROUP_ID = "_l1_fusion_group_id"; +const std::string ATTR_NAME_L1_FUSION_GROUP_KEY = "_l1_fusion_group_key"; +const std::string ATTR_NAME_FUSION_VIRTUAL_OP = "_fusion_virtual_op"; +const std::string ATTR_NAME_FUSION_GROUP_TYPE = "_fusion_group_type"; +const std::string ATTR_NAME_INPUT_MEM_TYPE_LIST = "_input_memory_type"; +const std::string ATTR_NAME_OUTPUT_MEM_TYPE_LIST = "_output_memory_type"; +const std::string ATTR_NAME_L1_FUSION_EXTEND_PTR = "_l1_fusion_extend_content"; +const std::string ATTR_NAME_GET_TENSOR_ACTUAL_SIZE = "_tensor_actual_size"; +const std::string ATTR_NAME_OUTPUT_OFFSET_FOR_L1_FUSION = "_output_offset_for_l1_fuison"; +const std::string ATTR_NAME_SWITCH_FOR_L1_FUSION = "_enable_l1_fusion"; +const std::string ATTR_N_BATCH_SPILT = "_is_n_batch_split"; +const std::string ATTR_NO_TASK_AND_DUMP_NEEDED = "_no_task_and_dump_needed"; + // Atomic addr clean attrs const std::string ATOMIC_ATTR_INPUT_INDEX = "atomic_input_index"; const std::string ATOMIC_ATTR_OUTPUT_INDEX = "atomic_output_index"; @@ -722,6 +941,9 @@ const std::string ATTR_INSERT_BY_MBATCH = "mbatch-inserted-node"; // For inserted op const std::string ATTR_INSERTED_BY_GE = "_inserted_by_ge"; +// For compress weight +const std::string ATTR_NAME_COMPRESS_WEIGHT = 
"_is_compress_weight"; + // For data dump const std::string ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES = "_datadump_original_op_names"; const std::string ATTR_NAME_DATA_DUMP_IS_MULTIOP = "_datadump_is_multiop"; @@ -732,24 +954,17 @@ const std::string ATTR_NAME_DATA_DUMP_ORIGIN_OUTPUT_INDEX = "_datadump_origin_ou const std::string ATTR_NAME_DATA_DUMP_ORIGIN_FORMAT = "_datadump_origin_format"; const std::string ATTR_NAME_DATA_DUMP_ORIGIN_DATA_TYPE = "_datadump_origin_data_type"; -// Variable -const std::string REF_VAR_SRC_VAR_NAME = "ref_var_src_var_name"; -const std::string VAR_ATTR_SRC_VAR_NAME = "_src_var_name"; -const std::string REF_VAR_PRE_PEER_OUT_INDEX = "ref_var_pre_peer_out_index"; -const std::string VAR_ATTR_VAR_IS_BROADCAST = "_var_is_broadcast"; -const std::string VAR_ATTR_VAR_IS_RESTORE = "_var_is_restore"; - -// HCOM -const std::string HCOM_ATTR_ROOT_RANK = "root_rank"; -const std::string HCOM_ATTR_RANK_SIZE = "rank_size"; -const std::string HCOM_ATTR_SHAPE = "shape"; -const std::string HCOM_ATTR_DATA_TYPE = "dtype"; +// functional ops attr +const std::string ATTR_NAME_TCOND = "Tcond"; +const std::string ATTR_NAME_TIN = "Tin"; +const std::string ATTR_NAME_TOUT = "Tout"; +const std::string ATTR_NAME_THEN_BRANCH = "then_branch"; +const std::string ATTR_NAME_ELSE_BRANCH = "else_branch"; -const std::string HCOM_ATTR_REDUCE_TYPE = "reduction"; +// used for label switch +const std::string ATTR_NAME_LABEL_SWITCH_INDEX = "_label_switch_index"; +const std::string ATTR_NAME_LABEL_SWITCH_LIST = "_label_switch_list"; const std::string ATTR_NAME_INPUT_DATATYPE = "input_datatype"; const std::string ATTR_NAME_OUTPUT_DATATYPE = "output_datatype"; - -// Dynamic stitch -const std::string DYNAMIC_STITCH_ATTR_NAME_NUM = "DynamicStitchN_"; } // namespace ge diff --git a/src/common/graph/ge_attr_value.cc b/src/common/graph/ge_attr_value.cc index 8eb91606..0a2893a4 100644 --- a/src/common/graph/ge_attr_value.cc +++ b/src/common/graph/ge_attr_value.cc @@ -22,7 +22,7 @@ #include "graph/model_serialize.h" #include "proto/ge_ir.pb.h" #include "detail/model_serialize_imp.h" -#include "graph/debug/ge_attr_define.h" +#include "debug/ge_attr_define.h" #include "debug/ge_log.h" #include "debug/ge_util.h" @@ -53,7 +53,7 @@ string GeAttrValue::NamedAttrs::GetName() const { GeAttrValue GeAttrValue::NamedAttrs::GetItem(const string &key) const { GeAttrValue value; - (void)GetAttr(key, value); + GetAttr(key, value); return value; } @@ -1081,6 +1081,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool AttrUtils::GetListInt(ConstA if (!GetListInt(std::move(obj), name, int64_list)) { return false; } + for (size_t i = 0; i < int64_list.size(); ++i) { if (int64_list[i] > INT32_MAX) { GELOGE(GRAPH_FAILED, "index %zu %ld int64_t value cannot cast to int32_t", i, int64_list[i]); @@ -1098,6 +1099,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool AttrUtils::GetListInt(ConstA if (!GetListInt(std::move(obj), name, int64_list)) { return false; } + for (size_t i = 0; i < int64_list.size(); ++i) { if (int64_list[i] > UINT32_MAX) { GELOGE(GRAPH_FAILED, "index %zu %ld int64_t value cannot cast to uint32_t", i, int64_list[i]); @@ -1215,6 +1217,23 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescPtr AttrUtils::CloneOpDesc( GE_CHK_BOOL_EXEC(imp.UnserializeOpDesc(op_desc, *op_def), return op_desc, "op_desc unserialize failed"); op_desc->extAttrs_ = org_op_desc->extAttrs_; + if (op_desc->HasAttr("_input_name_idx_key")) { + if (op_desc->DelAttr("_input_name_idx_key") != SUCCESS) { + GELOGE(GRAPH_FAILED, "DelAttr 
_input_name_idx_key failed."); + } + } + + if (op_desc->HasAttr("_input_name_idx_value")) { + if (op_desc->DelAttr("_input_name_idx_value") != SUCCESS) { + GELOGE(GRAPH_FAILED, "DelAttr _input_name_idx_value failed."); + } + } + + if (op_desc->HasAttr("_opt_input")) { + if (op_desc->DelAttr("_opt_input") != SUCCESS) { + GELOGE(GRAPH_FAILED, "DelAttr _opt_input failed."); + } + } return op_desc; } @@ -1237,11 +1256,6 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescPtr AttrUtils::CopyOpDesc(c op_desc->extAttrs_ = org_op_desc->extAttrs_; - op_desc->input_name_idx_.insert(org_op_desc->input_name_idx_.begin(), org_op_desc->input_name_idx_.end()); - op_desc->optional_input_names_.insert(org_op_desc->optional_input_names_.begin(), - org_op_desc->optional_input_names_.end()); - op_desc->output_name_idx_.insert(org_op_desc->output_name_idx_.begin(), org_op_desc->output_name_idx_.end()); - op_desc->output_name_idx_.insert(org_op_desc->output_name_idx_.begin(), org_op_desc->output_name_idx_.end()); op_desc->infer_func_ = org_op_desc->infer_func_; @@ -1250,4 +1264,25 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescPtr AttrUtils::CopyOpDesc(c return op_desc; } +std::string AttrUtils::GetAllAttrsStr(AttrUtils::ConstAttrHolderAdapter &&obj) { + auto holder = obj.get(); + if (holder == nullptr) { + return ""; + } + auto attrs_map = holder->GetAttrMap(); + if (attrs_map.GetProtoMsg() == nullptr) { + return ""; + } + + std::map<std::string, std::string> ordered_attrs; + for (auto &attr : *(attrs_map.GetProtoMsg())) { + ordered_attrs[attr.first] = attr.second.SerializeAsString(); + } + + std::stringstream ss; + for (auto &attr : ordered_attrs) { + ss << attr.first << ":" << attr.second << ";"; + } + return ss.str(); +} } // namespace ge diff --git a/src/common/graph/ge_tensor.cc b/src/common/graph/ge_tensor.cc index d5def041..ccf757fa 100644 --- a/src/common/graph/ge_tensor.cc +++ b/src/common/graph/ge_tensor.cc @@ -163,6 +163,34 @@ int64_t GeShape::GetShapeSize() const { return res; } +/// +/// @brief Check if the shape is unknown +/// @return bool +/// +bool GeShape::IsUnknownShape() const { + auto proto_msg = shape_def_.GetProtoMsg(); + if (proto_msg != nullptr) { + for (auto i : proto_msg->dim()) { + if (i < 0) { + return true; + } + } + } + return false; +} + +/// +/// @brief Check if the shape is a scalar +/// @return bool +/// +bool GeShape::IsScalar() const { + auto proto_msg = shape_def_.GetProtoMsg(); + if (proto_msg != nullptr) { + return proto_msg->dim().empty(); + } + return false; +} + const string TENSOR_UTILS_SIZE = "size"; const string TENSOR_UTILS_WEIGHT_SIZE = "weight_size"; const string TENSOR_UTILS_REUSE_INPUT = "reuse_input"; @@ -639,14 +667,14 @@ GeTensor &GeTensor::operator=(const GeTensor &other) { } GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus TensorUtils::GetSize(const GeTensorDesc &tensor_desc, - uint32_t &size) { + int64_t &size) { auto tensor_descriptor_msg = tensor_desc.tensor_descriptor_.GetProtoMsg(); GE_CHECK_NOTNULL(tensor_descriptor_msg); - size = static_cast<uint32_t>(tensor_descriptor_msg->size()); + size = static_cast<int64_t>(tensor_descriptor_msg->size()); return GRAPH_SUCCESS; } -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void TensorUtils::SetSize(GeTensorDesc &tensor_desc, uint32_t size) { +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void TensorUtils::SetSize(GeTensorDesc &tensor_desc, int64_t size) { auto tensor_descriptor_msg = tensor_desc.tensor_descriptor_.GetProtoMsg(); if (tensor_descriptor_msg != nullptr) { tensor_descriptor_msg->set_size(size); diff --git 
a/src/common/graph/model.cc b/src/common/graph/model.cc index fac09670..a3628204 100644 --- a/src/common/graph/model.cc +++ b/src/common/graph/model.cc @@ -49,6 +49,7 @@ void Model::Init() { (void)AttrUtils::SetInt(this, ATTR_MODEL_MEMORY_SIZE, 0); (void)AttrUtils::SetInt(this, ATTR_MODEL_STREAM_NUM, 0); (void)AttrUtils::SetInt(this, ATTR_MODEL_EVENT_NUM, 0); + (void)AttrUtils::SetInt(this, ATTR_MODEL_LABEL_NUM, 0); (void)AttrUtils::SetInt(this, ATTR_MODEL_WEIGHT_SIZE, 0); (void)AttrUtils::SetStr(this, ATTR_MODEL_TARGET_TYPE, TARGET_TYPE_MINI); version_ = 0; @@ -77,9 +78,9 @@ void Model::SetGraph(const ge::Graph &graph) { graph_ = graph; } Graph Model::GetGraph() const { return graph_; } -graphStatus Model::Save(Buffer &buffer) const { +graphStatus Model::Save(Buffer &buffer, bool is_dump) const { ModelSerialize serialize; - buffer = serialize.SerializeModel(*this); + buffer = serialize.SerializeModel(*this, is_dump); return buffer.GetSize() > 0 ? GRAPH_SUCCESS : GRAPH_FAILED; } @@ -113,7 +114,7 @@ graphStatus Model::SaveToFile(const string &file_name) const { } int fd = open(real_path, O_WRONLY | O_CREAT | O_TRUNC, ACCESS_PERMISSION_BITS); if (fd < 0) { - GELOGE(GRAPH_FAILED, "open file failed, file path [%s] ", real_path); + GELOGE(GRAPH_FAILED, "open file failed, file path [%s], %s ", real_path, strerror(errno)); return GRAPH_FAILED; } bool ret = ge_proto.SerializeToFileDescriptor(fd); @@ -129,6 +130,10 @@ graphStatus Model::SaveToFile(const string &file_name) const { GELOGE(GRAPH_FAILED, "close file descriptor fail."); return GRAPH_FAILED; } + if (!ret) { + GELOGE(GRAPH_FAILED, "function [SerializeToFileDescriptor] failed"); + return GRAPH_FAILED; + } } return GRAPH_SUCCESS; } @@ -152,7 +157,7 @@ graphStatus Model::LoadFromFile(const string &file_name) { } int fd = open(real_path, O_RDONLY); if (fd < 0) { - GELOGE(GRAPH_FAILED, "open file failed"); + GELOGE(GRAPH_FAILED, "open file failed, %s", strerror(errno)); return GRAPH_FAILED; } @@ -170,6 +175,10 @@ graphStatus Model::LoadFromFile(const string &file_name) { GELOGE(GRAPH_FAILED, "close file descriptor fail."); return GRAPH_FAILED; } + if (!ret) { + GELOGE(GRAPH_FAILED, "function [ParseFromFileDescriptor] failed"); + return GRAPH_FAILED; + } return Load(model_def); } diff --git a/src/common/graph/model_serialize.cc b/src/common/graph/model_serialize.cc index f92ebf1e..0ec4a2eb 100644 --- a/src/common/graph/model_serialize.cc +++ b/src/common/graph/model_serialize.cc @@ -15,10 +15,8 @@ */ #include "graph/model_serialize.h" - #include #include - #include "debug/ge_attr_define.h" #include "debug/ge_log.h" #include "debug/ge_util.h" @@ -26,6 +24,7 @@ #include "graph/detail/model_serialize_imp.h" #include "proto/ge_ir.pb.h" #include "utils/graph_utils.h" +#include "debug/ge_op_types.h" using std::string; @@ -84,20 +83,29 @@ bool ModelSerializeImp::SerializeEdge(const NodePtr &node, proto::OpDef *op_def_ return true; } -bool ModelSerializeImp::SerializeOpDesc(const ConstOpDescPtr &op_desc, proto::OpDef *op_def_proto) { +bool ModelSerializeImp::SerializeOpDesc(const ConstOpDescPtr &op_desc, proto::OpDef *op_def_proto, bool is_dump) { if (op_desc == nullptr || op_def_proto == nullptr) { GELOGE(GRAPH_FAILED, "Input Para Invalid"); return false; } if (op_desc->op_def_.GetProtoMsg() != nullptr) { *op_def_proto = *op_desc->op_def_.GetProtoMsg(); + // Delete unnecessary attr + if (is_dump) { + auto attr = op_def_proto->mutable_attr(); + attr->erase(ATTR_NAME_FRAMEWORK_NODE_DEF); + attr->erase(ATTR_NAME_FRAMEWORK_OP_DEF); + 
attr->erase(ATTR_NAME_FRAMEWORK_FUNC_DEF); + GE_IF_BOOL_EXEC((op_def_proto->type() == CONSTANT || op_def_proto->type() == CONSTANTOP), + attr->erase(ATTR_NAME_WEIGHTS)); + } op_def_proto->clear_input_desc(); op_def_proto->clear_output_desc(); // Input descs - if (op_desc->GetInputsSize() > 0) { - auto size = static_cast(op_desc->GetInputsSize()); + if (op_desc->GetAllInputsSize() > 0) { + auto size = static_cast(op_desc->GetAllInputsSize()); for (uint32_t i = 0; i < size; i++) { - auto tensor_desc = op_desc->GetInputDescPtr(i); + auto tensor_desc = op_desc->GetInputDescPtrDfault(i); if (tensor_desc != nullptr && tensor_desc->tensor_descriptor_.GetProtoMsg() != nullptr) { *op_def_proto->add_input_desc() = *(tensor_desc->tensor_descriptor_.GetProtoMsg()); } @@ -117,12 +125,12 @@ bool ModelSerializeImp::SerializeOpDesc(const ConstOpDescPtr &op_desc, proto::Op return true; } -bool ModelSerializeImp::SerializeNode(const NodePtr &node, proto::OpDef *op_def_proto) { +bool ModelSerializeImp::SerializeNode(const NodePtr &node, proto::OpDef *op_def_proto, bool is_dump) { if (node == nullptr || op_def_proto == nullptr) { GELOGE(GRAPH_FAILED, "Input Para Node Invalid"); return false; } - if (!SerializeOpDesc(node->GetOpDesc(), op_def_proto)) { + if (!SerializeOpDesc(node->GetOpDesc(), op_def_proto, is_dump)) { GELOGE(GRAPH_FAILED, "Serialize OpDesc failed"); return false; } @@ -134,7 +142,8 @@ bool ModelSerializeImp::SerializeNode(const NodePtr &node, proto::OpDef *op_def_ } GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool ModelSerializeImp::SerializeGraph(const ConstComputeGraphPtr &graph, - proto::GraphDef *graph_proto) { + proto::GraphDef *graph_proto, + bool is_dump) { if (graph == nullptr || graph_proto == nullptr) { GELOGE(GRAPH_FAILED, "Input para Invalid"); return false; @@ -156,7 +165,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool ModelSerializeImp::Serialize *graph_proto->mutable_attr() = *graph->attrs_.GetProtoMsg(); } for (const auto &node : graph->GetDirectNode()) { - if (!SerializeNode(node, graph_proto->add_op())) { + if (!SerializeNode(node, graph_proto->add_op(), is_dump)) { if (node->GetOpDesc() != nullptr) { GELOGE(GRAPH_FAILED, "Serialize Node %s failed", node->GetName().c_str()); } @@ -166,7 +175,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool ModelSerializeImp::Serialize return true; } -bool ModelSerializeImp::SerializeModel(const Model &model, proto::ModelDef *model_proto) { +bool ModelSerializeImp::SerializeModel(const Model &model, proto::ModelDef *model_proto, bool is_dump) { if (model_proto == nullptr) { GELOGE(GRAPH_FAILED, "model_proto para Invalid"); return false; @@ -183,7 +192,7 @@ bool ModelSerializeImp::SerializeModel(const Model &model, proto::ModelDef *mode GELOGE(GRAPH_FAILED, "GetComputeGraph return nullptr"); return false; } - if (!SerializeGraph(compute_graph, model_proto->add_graph())) { + if (!SerializeGraph(compute_graph, model_proto->add_graph(), is_dump)) { GELOGE(GRAPH_FAILED, "SerializeGraph fail"); return false; } @@ -390,10 +399,10 @@ bool ReadProtoFromBinaryFile(const uint8_t *data, size_t len, google::protobuf:: return true; } -Buffer ModelSerialize::SerializeModel(const Model &model) { +Buffer ModelSerialize::SerializeModel(const Model &model, bool is_dump) { proto::ModelDef model_def; ModelSerializeImp imp; - if (!imp.SerializeModel(model, &model_def)) { + if (!imp.SerializeModel(model, &model_def, is_dump)) { return Buffer(); } #if !defined(__ANDROID__) && !defined(ANDROID) diff --git a/src/common/graph/node.cc 
b/src/common/graph/node.cc index 42558ddf..1c8f327b 100644 --- a/src/common/graph/node.cc +++ b/src/common/graph/node.cc @@ -401,7 +401,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Node::Vistor Node::Get vec.push_back(in_anchor); } } - // Push back in_control_anchor_ + // Push back in_control_anchor_ if ((in_control_anchor_->GetPeerOutControlAnchors().size() > 0) || (in_control_anchor_->GetPeerOutDataAnchors().size() > 0)) { auto in_anchor = Anchor::DynamicAnchorCast(in_control_anchor_); @@ -512,7 +512,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Node::Vistor Node::GetIn auto peer_out_anchors = in_control_anchor_->GetPeerOutDataAnchors(); for (const auto &out_anchor : peer_out_anchors) { - GE_CHK_BOOL_EXEC(out_anchor != nullptr, continue, " in_control_anchor_ peer out data anchors is nullptr"); + GE_CHK_BOOL_EXEC(out_anchor != nullptr, continue, "in_control_anchor_ peer out data anchors is nullptr"); auto node = out_anchor->GetOwnerNode(); GE_CHK_BOOL_EXEC(node != nullptr, continue, "GetOwnerNode is nullptr"); vec.push_back(node); @@ -521,7 +521,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Node::Vistor Node::GetIn auto peer_out_control_anchors = in_control_anchor_->GetPeerOutControlAnchors(); for (const auto &out_control_anchor : peer_out_control_anchors) { GE_CHK_BOOL_EXEC(out_control_anchor != nullptr, continue, - " in_control_anchor_ peer out control anchors is nullptr"); + "in_control_anchor_ peer out control anchors is nullptr"); auto node = out_control_anchor->GetOwnerNode(); GE_CHK_BOOL_EXEC(node != nullptr, continue, "GetOwnerNode is nullptr"); vec.push_back(node); @@ -785,6 +785,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus Node::UpdateOpDesc(co GE_CHK_BOOL_EXEC(op_->GetInputsSize() == op_desc->GetInputsSize(), return GRAPH_PARAM_INVALID, "Inputs count expected to be same, orginial OpDesc %zu, Param OpDesc %zu", op_->GetInputsSize(), op_desc->GetInputsSize()); + GE_CHK_BOOL_EXEC(op_->GetOutputsSize() == op_desc->GetOutputsSize(), return GRAPH_PARAM_INVALID, "Outputs count expected to be same, orginial OpDesc %zu, Param OpDesc %zu", op_->GetOutputsSize(), op_desc->GetOutputsSize()); diff --git a/src/common/graph/op_desc.cc b/src/common/graph/op_desc.cc index e6184ed3..620c815c 100644 --- a/src/common/graph/op_desc.cc +++ b/src/common/graph/op_desc.cc @@ -61,6 +61,12 @@ const std::string ATTR_NAME_WORKSPACE_BYTES = "workspace_bytes"; const std::string ATTR_NAME_IS_INPUT_CONST = "is_input_const"; +const std::string ATTR_NAME_OPT_INPUT = "_opt_input"; + +const std::string ATTR_NAME_INPUT_NAME_IDX_KEY = "_input_name_idx_key"; + +const std::string ATTR_NAME_INPUT_NAME_IDX_VALUE = "_input_name_idx_value"; + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDesc::OpDesc() { op_def_.InitDefault(); if (op_def_.GetProtoMsg() != nullptr) { @@ -202,7 +208,8 @@ graphStatus OpDesc::AddInputDesc(uint32_t index, const ge::GeTensorDesc &input_d } graphStatus OpDesc::AddInputDesc(const string &name, const ge::GeTensorDesc &input_desc) { - if (input_name_idx_.find(name) != input_name_idx_.end()) { + auto input_name_idx = GetAllInputName(); + if (input_name_idx.find(name) != input_name_idx.end()) { GELOGI("input %s is exist, update it", name.c_str()); graphStatus ret = UpdateInputDesc(name, input_desc); return ret; @@ -214,15 +221,17 @@ graphStatus OpDesc::AddInputDesc(const string &name, const ge::GeTensorDesc &inp return GRAPH_FAILED; } inputs_desc_.push_back(in_desc); - (void)input_name_idx_.insert(make_pair(name, index)); + 
(void)input_name_idx.insert(make_pair(name, index)); + SetAllInputName(input_name_idx); return GRAPH_SUCCESS; } } graphStatus OpDesc::AddInputDescForward(const string &name, const unsigned int num) { + auto input_name_idx = GetAllInputName(); for (unsigned int i = 0; i < num; i++) { string input_name = name + std::to_string(i); - GE_CHK_BOOL_RET_STATUS((input_name_idx_.find(input_name) == input_name_idx_.end()), GRAPH_FAILED, + GE_CHK_BOOL_RET_STATUS((input_name_idx.find(input_name) == input_name_idx.end()), GRAPH_FAILED, "Add input tensor_desc is existed. name[%s]", input_name.c_str()); std::shared_ptr in_desc = ComGraphMakeShared(GeTensorDesc()); @@ -234,12 +243,13 @@ graphStatus OpDesc::AddInputDescForward(const string &name, const unsigned int n (void)inputs_desc_.insert(inputs_desc_.begin(), in_desc); // Update index in input_name_idx - for (auto it = input_name_idx_.begin(); it != input_name_idx_.end(); ++it) { + for (auto it = input_name_idx.begin(); it != input_name_idx.end(); ++it) { it->second += 1; } - (void)input_name_idx_.insert(make_pair(input_name, 0)); + (void)input_name_idx.insert(make_pair(input_name, 0)); } + SetAllInputName(input_name_idx); return GRAPH_SUCCESS; } @@ -270,10 +280,19 @@ graphStatus OpDesc::AddOutputDescForward(const string &name, const unsigned int graphStatus OpDesc::AddOptionalInputDesc(const string &name, const ge::GeTensorDesc &input_desc) { if (OpDesc::AddInputDesc(name, input_desc) == GRAPH_FAILED) return GRAPH_FAILED; - (void)optional_input_names_.insert(name); + vector optional_input_names; + (void)AttrUtils::GetListStr(this, ATTR_NAME_OPT_INPUT, optional_input_names); + optional_input_names.push_back(name); + (void)AttrUtils::SetListStr(this, ATTR_NAME_OPT_INPUT, optional_input_names); return GRAPH_SUCCESS; } +std::vector OpDesc::GetAllOptionalInputName() const { + vector optional_input_names; + (void)AttrUtils::GetListStr(this, ATTR_NAME_OPT_INPUT, optional_input_names); + return optional_input_names; +} + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus OpDesc::UpdateInputDesc(uint32_t index, const ge::GeTensorDesc &tensor_Desc) { GE_CHK_BOOL_RET_STATUS((index < inputs_desc_.size()), GRAPH_FAILED, "The index is invalid. 
index[%u]", index); @@ -288,11 +307,12 @@ OpDesc::UpdateInputDesc(uint32_t index, const ge::GeTensorDesc &tensor_Desc) { } GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool OpDesc::OpDescMembersAreEqual(const OpDesc &r_op_desc) const { - return (IsEqual(this->input_name_idx_, r_op_desc.input_name_idx_, "OpDesc.input_name_idx_") && - IsEqual(this->output_name_idx_, r_op_desc.output_name_idx_, "OpDesc.output_name_idx_") && - IsEqual(this->optional_input_names_, r_op_desc.optional_input_names_, "OpDesc.optional_input_names_") && - IsEqual(this->engine_name_, r_op_desc.engine_name_, "OpDesc.engine_name_") && - IsEqual(this->op_kernel_lib_name_, r_op_desc.op_kernel_lib_name_, "OpDesc.op_kernel_lib_name_")); + return ( + IsEqual(this->GetAllInputName(), r_op_desc.GetAllInputName(), "OpDesc.GetAllInputName()") && + IsEqual(this->output_name_idx_, r_op_desc.output_name_idx_, "OpDesc.output_name_idx_") && + IsEqual(this->GetAllOptionalInputName(), r_op_desc.GetAllOptionalInputName(), "OpDesc.GetAllOptionalInputName()") && + IsEqual(this->engine_name_, r_op_desc.engine_name_, "OpDesc.engine_name_") && + IsEqual(this->op_kernel_lib_name_, r_op_desc.op_kernel_lib_name_, "OpDesc.op_kernel_lib_name_")); } GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool OpDesc::OpDescAttrsAreEqual(const OpDesc &r_op_desc) const { @@ -366,8 +386,9 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool OpDesc::operator==(const OpD } graphStatus OpDesc::UpdateInputDesc(const string &name, const ge::GeTensorDesc &tensor_Desc) { - auto it = input_name_idx_.find(name); - if (it == input_name_idx_.end()) { + auto input_name_idx = GetAllInputName(); + auto it = input_name_idx.find(name); + if (it == input_name_idx.end()) { GELOGW("Cann't find the input desc. name[%s]", name.c_str()); return GRAPH_FAILED; } @@ -387,8 +408,9 @@ graphStatus OpDesc::UpdateInputDesc(const string &name, const ge::GeTensorDesc & } bool OpDesc::InputIsSet(const string &name) const { - auto it = input_name_idx_.find(name); - if (it != input_name_idx_.end()) { + auto input_name_idx = GetAllInputName(); + auto it = input_name_idx.find(name); + if (it != input_name_idx.end()) { GE_IF_BOOL_EXEC(it->second >= inputs_desc_.size(), GELOGE(GRAPH_FAILED, "it->second is invalid."); return false); auto tensor_desc = inputs_desc_[it->second]; GE_IF_BOOL_EXEC(tensor_desc == nullptr, GELOGE(GRAPH_FAILED, "tensor_desc is null."); return false); @@ -406,18 +428,20 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeTensorDesc OpDesc::GetInputDesc } GeTensorDesc OpDesc::GetInputDesc(const string &name) const { - auto it = input_name_idx_.find(name); - GE_CHK_BOOL_RET_STATUS_NOLOG(it != input_name_idx_.end(), GeTensorDesc()); + auto input_name_idx = GetAllInputName(); + auto it = input_name_idx.find(name); + GE_CHK_BOOL_RET_STATUS_NOLOG(it != input_name_idx.end(), GeTensorDesc()); GE_CHK_BOOL_RET_STATUS_NOLOG(it->second < inputs_desc_.size(), GeTensorDesc()); return *(inputs_desc_[it->second].get()); } GE_FUNC_HOST_VISIBILITY OpDesc::Vistor OpDesc::GetAllInputNames() const { + auto input_name_idx = GetAllInputName(); vector names; - if (input_name_idx_.empty()) { + if (input_name_idx.empty()) { return OpDesc::Vistor(shared_from_this(), names); } - for (std::pair input : input_name_idx_) { + for (std::pair input : input_name_idx) { names.push_back(input.first); } return OpDesc::Vistor(shared_from_this(), names); @@ -483,6 +507,8 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY size_t OpDesc::GetInputsSize() co return size; } +GE_FUNC_DEV_VISIBILITY 
GE_FUNC_HOST_VISIBILITY size_t OpDesc::GetAllInputsSize() const { return inputs_desc_.size(); } + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus OpDesc::AddOutputDesc(const ge::GeTensorDesc &output_desc) { int index = static_cast<int>(outputs_desc_.size()); return AddOutputDesc("__output" + std::to_string(index), output_desc); @@ -548,6 +574,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeTensorDescPtr OpDesc::MutableOu return outputs_desc_[index]; } +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY uint32_t OpDesc::GetAllOutputsDescSize() const { + return static_cast<uint32_t>(outputs_desc_.size()); +} + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDesc::Vistor<GeTensorDesc> OpDesc::GetAllOutputsDesc() const { vector<GeTensorDesc> temp{}; for (const auto &it : outputs_desc_) { @@ -580,6 +610,19 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ConstGeTensorDescPtr OpDesc::GetI } } +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ConstGeTensorDescPtr +OpDesc::GetInputDescPtrDfault(uint32_t index) const { + GE_CHK_BOOL_RET_STATUS_NOLOG((index) < (uint32_t)(inputs_desc_.size()), nullptr); + return inputs_desc_[(int32_t)index]; +} + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ConstGeTensorDescPtr OpDesc::GetInputDescPtr(const string &name) const { + auto input_name_idx = GetAllInputName(); + auto it = input_name_idx.find(name); + GE_CHK_BOOL_RET_STATUS_NOLOG(it != input_name_idx.end(), shared_ptr<const GeTensorDesc>()); + return inputs_desc_[it->second]; +} + graphStatus OpDesc::AddDynamicInputDesc(const string &name, const unsigned int num, bool is_push_back) { if (is_push_back) { for (unsigned int i = 0; i < num; i++) { @@ -603,12 +646,45 @@ graphStatus OpDesc::AddDynamicOutputDesc(const string &name, const unsigned int } bool OpDesc::IsOptionalInput(const string &name) const { - return optional_input_names_.find(name) != optional_input_names_.end(); + vector<string> optional_input_names; + (void)AttrUtils::GetListStr(this, ATTR_NAME_OPT_INPUT, optional_input_names); + for (auto &item : optional_input_names) { + if (item == name) { + return true; + } + } + return false; } bool OpDesc::IsOptionalInput(uint32_t index) const { return IsOptionalInput(GetInputNameByIndex(index)); } -std::map<string, uint32_t> OpDesc::GetAllInputName() { return input_name_idx_; } +std::map<string, uint32_t> OpDesc::GetAllInputName() const { + std::map<string, uint32_t> input_name_idx; + std::vector<string> key; + std::vector<int64_t> value; + (void)AttrUtils::GetListStr(this, ATTR_NAME_INPUT_NAME_IDX_KEY, key); + (void)AttrUtils::GetListInt(this, ATTR_NAME_INPUT_NAME_IDX_VALUE, value); + + if (key.size() != value.size()) { + GE_LOGE("the two vector sizes are different. key_size: %zu, value_size: %zu.", key.size(), value.size()); + } else { + for (uint32_t i = 0; i < key.size(); ++i) { + input_name_idx.insert(std::pair<string, uint32_t>(key.at(i), value.at(i))); + } + } + return input_name_idx; +} + +void OpDesc::SetAllInputName(const std::map<string, uint32_t> &input_name_idx) { + std::vector<string> key; + std::vector<int64_t> value; + for (auto &item : input_name_idx) { + key.emplace_back(item.first); + value.emplace_back(item.second); + } + (void)AttrUtils::SetListStr(this, ATTR_NAME_INPUT_NAME_IDX_KEY, key); + (void)AttrUtils::SetListInt(this, ATTR_NAME_INPUT_NAME_IDX_VALUE, value); +} std::map<string, uint32_t> OpDesc::GetAllOutputName() { return output_name_idx_; }
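// Example (editor's sketch, not part of the original patch): with this change the
// input name->index map round-trips through two parallel list attributes instead of
// a plain member. Hypothetical usage, assuming an existing OpDescPtr op_desc:
//   std::map<string, uint32_t> names{{"x", 0}, {"bias", 1}};
//   op_desc->SetAllInputName(names);             // writes _input_name_idx_key / _input_name_idx_value
//   auto restored = op_desc->GetAllInputName();  // rebuilds the map from the two attrs
//   // restored == names holds as long as the key and value lists keep equal length
@@ -619,6 +695,7 @@ bool OpDesc::UpdateInputName(std::map<string, uint32_t> input_name_idx) { auto factory_map_size = input_name_idx.size(); // It indicates that some inputs have no optionalname. 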
// The redundant optionalname of factory needs to be deleted and then assigned + auto all_input_name_idx = GetAllInputName(); if (input_map_size < factory_map_size) { GELOGI("UpdateInputName org inputname map size: %zu, factory inputname map size: %zu", input_map_size, factory_map_size); @@ -631,22 +708,23 @@ bool OpDesc::UpdateInputName(std::map input_name_idx) { } if (input_name_idx.size() == input_map_size) { GELOGI("UpdateInputName"); - input_name_idx_ = input_name_idx; + all_input_name_idx = input_name_idx; } else { ret = false; GELOGW("after UpdateInputName factoryName map size : %zu", input_name_idx.size()); } } else if (input_map_size == factory_map_size) { - input_name_idx_ = input_name_idx; + all_input_name_idx = input_name_idx; } else { ret = false; GELOGW("org inputname map size: %zu, factory inputname map size: %zu", input_map_size, factory_map_size); } + SetAllInputName(all_input_name_idx); return ret; } bool OpDesc::UpdateOutputName(std::map output_name_idx) { - size_t output_map_size = GetAllOutputsDesc().size(); + size_t output_map_size = GetAllOutputsDescSize(); size_t factory_map_size = output_name_idx.size(); if (output_map_size < factory_map_size) { GELOGI("UpdateOutputName org outputname map size: %zu, factory outputname map size: %zu", output_map_size, @@ -754,17 +832,17 @@ graphStatus OpDesc::OpVerify() { } graphStatus OpDesc::CommonVerify() const { - for (string iname : GetAllInputNames()) { + for (const string &iname : GetAllInputNames()) { // Checking shape of all inputs - vector ishape = GetInputDesc(iname).GetShape().GetDims(); + vector ishape = GetInputDescPtr(iname)->GetShape().GetDims(); for (int64_t dim : ishape) { GE_CHK_BOOL_RET_STATUS(dim >= -1, GRAPH_FAILED, "operator input %s shape contains negative or zero dimension.", iname.c_str()); } } // Check all attributes defined - const auto all_attributes = GetAllAttrs(); - for (const auto name : GetAllAttrNames()) { + const auto &all_attributes = GetAllAttrs(); + for (const auto &name : GetAllAttrNames()) { GE_CHK_BOOL_RET_STATUS(all_attributes.find(name) != all_attributes.end(), GRAPH_FAILED, "operator attribute %s is empty.", name.c_str()); } @@ -773,19 +851,21 @@ graphStatus OpDesc::CommonVerify() const { } GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY string OpDesc::GetInputNameByIndex(uint32_t index) const { - auto it = input_name_idx_.begin(); - for (; it != input_name_idx_.end(); ++it) { + auto input_name_idx = GetAllInputName(); + auto it = input_name_idx.begin(); + for (; it != input_name_idx.end(); ++it) { if (it->second == index) { break; } } - GE_CHK_BOOL_RET_STATUS_NOLOG(it != input_name_idx_.end(), ""); + GE_CHK_BOOL_RET_STATUS_NOLOG(it != input_name_idx.end(), ""); return it->first; } int OpDesc::GetInputIndexByName(const string &name) const { - auto it_find = input_name_idx_.find(name); - GE_CHK_BOOL_RET_STATUS_NOLOG(it_find != input_name_idx_.end(), -1); + auto input_name_idx = GetAllInputName(); + auto it_find = input_name_idx.find(name); + GE_CHK_BOOL_RET_STATUS_NOLOG(it_find != input_name_idx.end(), -1); return static_cast(it_find->second); } @@ -1065,10 +1145,12 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector OpDesc::GetIsInputCo GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus OpDesc::RestoreInputNameIdx(const string &name, const int &index) { - if (input_name_idx_.find(name) != input_name_idx_.end()) { + auto input_name_idx = GetAllInputName(); + if (input_name_idx.find(name) != input_name_idx.end()) { GELOGI("Restore input name index is existed. 
name[%s]", name.c_str()); } - (void)input_name_idx_.insert(make_pair(name, index)); + (void)input_name_idx.insert(make_pair(name, index)); + SetAllInputName(input_name_idx); return GRAPH_SUCCESS; } @@ -1104,4 +1186,45 @@ graphStatus OpDesc::CallInferFormatFunc(Operator &op) { } return (graphStatus)infer_format_func_(op); } + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY std::string OpDesc::GetSubgraphInstanceName(uint32_t index) const { + if (static_cast(index) >= subgraph_instance_names_.size()) { + return ""; + } + return subgraph_instance_names_.at(index); +} + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY const std::vector &OpDesc::GetSubgraphInstanceNames() + const { + return subgraph_instance_names_; +} + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void OpDesc::AddSubgraphInstanceName(std::string name) { + subgraph_instance_names_.emplace_back(std::move(name)); +} + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void OpDesc::RemoveSubgraphInstanceName(const std::string &name) { + for (auto iter = subgraph_instance_names_.begin(); iter != subgraph_instance_names_.end(); ++iter) { + if (*iter == name) { + subgraph_instance_names_.erase(iter); + return; + } + } +} + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus OpDesc::AddSubgraphName(const std::string &name) { + auto iter = subgraph_names_to_index_.find(name); + if (iter != subgraph_names_to_index_.end()) { + GELOGW("The subgraph name %s exists, index %u", name.c_str(), iter->second); + return GRAPH_FAILED; + } + auto size = subgraph_names_to_index_.size(); + subgraph_names_to_index_[name] = size; + return GRAPH_SUCCESS; +} + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY const std::map &OpDesc::GetSubgraphNameIndexes() + const { + return subgraph_names_to_index_; +} } // namespace ge diff --git a/src/common/graph/op_imp.cc b/src/common/graph/op_imp.cc index 02048bad..9abf242b 100644 --- a/src/common/graph/op_imp.cc +++ b/src/common/graph/op_imp.cc @@ -20,8 +20,7 @@ #include "debug/ge_log.h" #include "debug/ge_util.h" -using std::function; -using std::vector; +using namespace std; namespace ge { diff --git a/src/common/graph/operator.cc b/src/common/graph/operator.cc index 7bfedad9..6d372297 100644 --- a/src/common/graph/operator.cc +++ b/src/common/graph/operator.cc @@ -15,13 +15,12 @@ */ #include "external/graph/operator.h" - #include #include #include #include #include - +#include "array_ops.h" #include "debug/ge_log.h" #include "debug/ge_op_types.h" #include "debug/ge_util.h" @@ -33,7 +32,6 @@ #include "graph/ge_tensor.h" #include "graph/node.h" #include "graph/op_desc.h" -#include "graph/operator_factory.h" #include "graph/usr_types.h" #include "utils/graph_utils.h" #include "utils/op_desc_utils.h" @@ -48,10 +46,6 @@ using std::string; using std::to_string; using std::vector; -namespace { -const char *const kValue = "value"; -} // namespace - namespace ge { class OpIO { public: @@ -148,6 +142,7 @@ class OperatorImpl : public std::enable_shared_from_this { for (int i = static_cast(is_input_const.size()); i <= dst_index; ++i) { is_input_const.push_back(false); } + is_input_const[dst_index] = is_const; op_desc_->SetIsInputConst(is_input_const); @@ -179,8 +174,8 @@ class OperatorImpl : public std::enable_shared_from_this { GE_CHK_BOOL_EXEC(dst_index >= 0, return, "Find input index by name failed. name[%s], op name:%s", dst_name.c_str(), op_desc_->GetName().c_str()); auto out_op_impl = out_handler->GetOwner(); - GE_CHK_BOOL_EXEC(out_op_impl && out_op_impl->GetOpDescImpl(), return, "out_handler invalid. 
name[%s]", - dst_name.c_str()); + GE_CHK_BOOL_EXEC(out_op_impl != nullptr && out_op_impl->GetOpDescImpl() != nullptr, return, + "out_handler invalid. name[%s]", dst_name.c_str()); bool is_const = false; if (out_op_impl->GetOpDescImpl()->GetType() == CONSTANT) { is_const = true; @@ -193,7 +188,7 @@ class OperatorImpl : public std::enable_shared_from_this { op_desc_->SetIsInputConst(is_input_const); OpIO in_handler(dst_name, dst_index, shared_from_this()); - GE_CHK_BOOL_EXEC(!!out_op_impl, return, "Get out_handler's impl failed."); + GE_CHK_BOOL_EXEC(out_op_impl != nullptr, return, "Get out_handler's impl failed."); out_op_impl->UpdateLinkMapImpl(src_name, in_handler); auto src_output_desc = out_op_impl->GetOutputDesc(src_name); @@ -210,7 +205,7 @@ class OperatorImpl : public std::enable_shared_from_this { void AddControlInputImp(const ge::Operator &src_oprt) { if (src_oprt.operator_impl_ == nullptr) { - GELOGE(GRAPH_FAILED, "Src operator impl is nullptr"); + GELOGE(FAILED, "Src operator impl is nullptr"); return; } for (auto &input : control_input_link_) { @@ -520,9 +515,9 @@ graphStatus Operator::GetInputConstData(const string &dst_name, Tensor &data) co if (peer_node_ptr->GetOpDesc() != nullptr) { const auto &op_descType = peer_node_ptr->GetOpDesc()->GetType(); if (op_descType == CONSTANTOP) { - return const_op.GetAttr(kValue, data); + return const_op.GetAttr(op::Constant::name_attr_value(), data); } else if (op_descType == CONSTANT) { - return const_op.GetAttr(kValue, data); + return const_op.GetAttr(op::Const::name_attr_value(), data); } } } else { @@ -542,9 +537,9 @@ graphStatus Operator::GetInputConstDataOut(const string &dst_name, Tensor &data) Operator const_op(out_handle.GetOwner()); const auto &op_desc_impl_type = out_handle.GetOwner()->GetOpDescImpl()->GetType(); if (op_desc_impl_type == CONSTANTOP) { - return const_op.GetAttr(kValue, data); + return const_op.GetAttr(op::Constant::name_attr_value(), data); } else if (op_desc_impl_type == CONSTANT) { - return const_op.GetAttr(kValue, data); + return const_op.GetAttr(op::Const::name_attr_value(), data); } } return GRAPH_FAILED; @@ -709,6 +704,7 @@ void Operator::InputRegister(const string &name) { void Operator::OptionalInputRegister(const string &name) { GE_CHK_BOOL_EXEC(operator_impl_ != nullptr, return, "operator impl is nullptr."); GE_CHK_BOOL_EXEC(operator_impl_->GetOpDescImpl() != nullptr, return, "GetOpDescImpl is nullptr."); + // [No need to verify return value] (void)operator_impl_->GetOpDescImpl()->AddOptionalInputDesc(name, GeTensorDesc(GeShape(), FORMAT_RESERVED, DT_UNDEFINED)); } @@ -716,24 +712,28 @@ void Operator::OptionalInputRegister(const string &name) { void Operator::InferFuncRegister(const std::function &func) { GE_CHK_BOOL_EXEC(operator_impl_ != nullptr, return, "operator impl is nullptr."); GE_CHK_BOOL_EXEC(operator_impl_->GetOpDescImpl() != nullptr, return, "GetOpDescImpl is nullptr."); + // [No need to verify return value] (void)operator_impl_->GetOpDescImpl()->AddInferFunc(func); } void Operator::InferFormatFuncRegister(const std::function &func) { GE_CHK_BOOL_EXEC(operator_impl_ != nullptr, return, "operator impl is nullptr."); GE_CHK_BOOL_EXEC(operator_impl_->GetOpDescImpl() != nullptr, return, "GetOpDescImpl is nullptr."); + // [No need to verify return value] (void)operator_impl_->GetOpDescImpl()->AddInferFormatFunc(func); } void Operator::VerifierFuncRegister(const std::function &func) { GE_CHK_BOOL_EXEC(operator_impl_ != nullptr, return, "operator impl is nullptr."); 
GE_CHK_BOOL_EXEC(operator_impl_->GetOpDescImpl() != nullptr, return, "GetOpDescImpl is nullptr."); + // [No need to verify return value] (void)operator_impl_->GetOpDescImpl()->AddVerifierFunc(func); } void Operator::OutputRegister(const string &name) { GE_CHK_BOOL_EXEC(operator_impl_ != nullptr, return, "operator impl is nullptr."); GE_CHK_BOOL_EXEC(operator_impl_->GetOpDescImpl() != nullptr, return, "GetOpDescImpl is nullptr."); + // [No need to verify return value] (void)operator_impl_->GetOpDescImpl()->AddOutputDesc(name, GeTensorDesc()); } @@ -757,7 +757,8 @@ int Operator::GetDynamicInputNum(const string &name) const { void Operator::DynamicOutputRegister(const string &name, const unsigned int num, bool is_push_back) { GE_CHK_BOOL_EXEC(operator_impl_ != nullptr, return, "operator impl is nullptr."); GE_CHK_BOOL_EXEC(operator_impl_->GetOpDescImpl() != nullptr, return, "GetOpDescImpl is nullptr."); - (void)AttrUtils::SetInt(operator_impl_->GetOpDescImpl(), DYNAMIC_OUTPUT_TD_NUM(name), num); + GE_CHK_BOOL_EXEC(AttrUtils::SetInt(operator_impl_->GetOpDescImpl(), DYNAMIC_OUTPUT_TD_NUM(name), num), return, + "Set %s int failed", name.c_str()); (void)operator_impl_->GetOpDescImpl()->AddDynamicOutputDesc(name, num, is_push_back); } @@ -765,7 +766,8 @@ int Operator::GetDynamicOutputNum(const string &name) const { GE_CHK_BOOL_EXEC(operator_impl_ != nullptr, return 0, "operator impl is nullptr."); GE_CHK_BOOL_EXEC(operator_impl_->GetOpDescImpl() != nullptr, return 0, "GetOpDescImpl is nullptr."); int num = 0; - (void)AttrUtils::GetInt(operator_impl_->GetOpDescImpl(), DYNAMIC_INPUT_TD_NUM(name), num); + GE_CHK_BOOL_EXEC(AttrUtils::GetInt(operator_impl_->GetOpDescImpl(), DYNAMIC_OUTPUT_TD_NUM(name), num), return num, + "Get %s int failed", name.c_str()); return num; } @@ -1141,7 +1143,9 @@ class GraphBuilderImpl { GELOGW("Input operator should be Data, Variable operator or operator that has output but no input."); } } - + GE_CHK_BOOL_EXEC(!vec_inputs.empty(), return nullptr, + "User input does not include an operator such as \ Data, a Variable operator, or an operator that has output but no input."); auto ret = WalkAllOperators(vec_inputs); GE_CHK_BOOL_EXEC(ret == GRAPH_SUCCESS, return nullptr, "WalkAllOperators failed."); @@ -1163,7 +1167,8 @@ que.pop(); for (const auto &op_impl : vec_tem) { GE_CHK_BOOL_EXEC(op_impl != nullptr, return GRAPH_FAILED, "Operator Impl is null.") - GE_CHK_BOOL_EXEC_INFO(all_nodes_info_.find(op_impl) == all_nodes_info_.end(), continue) + GE_CHK_BOOL_EXEC_INFO(all_nodes_info_.find(op_impl) == all_nodes_info_.end(), continue, + "This node %s has already been created.", op_impl->GetName().c_str()) auto node_ptr = graph_->AddNode(op_impl->op_desc_); GE_CHK_BOOL_EXEC(node_ptr != nullptr, return GRAPH_FAILED, "Add node failed."); all_nodes_info_.insert(std::make_pair(op_impl, node_ptr)); @@ -1202,10 +1207,13 @@ for (const auto &node_info : all_nodes_info_) { auto src_op_impl_ptr = node_info.first; auto src_node_ptr = node_info.second; + GE_IF_BOOL_EXEC(src_op_impl_ptr == nullptr || src_node_ptr == nullptr, continue); auto out_links = src_op_impl_ptr->output_links_; + GE_CHK_BOOL_EXEC(src_op_impl_ptr->op_desc_ != nullptr, return GRAPH_FAILED, + "Src operator impl's op_desc is null."); auto &op_desc = src_op_impl_ptr->op_desc_; - + GE_IF_BOOL_EXEC(op_desc == nullptr, continue); for (const auto &out : out_links) { auto src_idx = op_desc->GetOutputIndexByName(out.first); GE_CHK_BOOL_EXEC(src_idx >= 0, return GRAPH_FAILED, "Find output index by name 
failed"); @@ -1216,7 +1224,9 @@ class GraphBuilderImpl { for (const auto &dst_opio : out.second) { auto dst_node_info = all_nodes_info_.find(dst_opio.GetOwner()); GE_CHK_BOOL_EXEC(dst_node_info != all_nodes_info_.end(), return GRAPH_FAILED, "Find Dst node failed."); + GE_IF_BOOL_EXEC(dst_node_info->second == nullptr, continue); + auto dst_anchor = dst_node_info->second->GetInDataAnchor(dst_opio.GetIndex()); GE_CHK_BOOL_EXEC(dst_anchor != nullptr, return GRAPH_FAILED, "GetInDataAnchor failed."); @@ -1260,8 +1270,7 @@ inline bool HasSameNameNode(const ComputeGraphPtr &compute_graph) { ComputeGraphPtr GraphUtils::CreateGraphFromOperator(const string &name, const vector &inputs) { auto graph_builder_impl = GraphBuilderImpl(name); ComputeGraphPtr compute_graph = graph_builder_impl.BuildGraph(inputs); - GE_IF_BOOL_EXEC(compute_graph == nullptr, return compute_graph); - + GE_CHK_BOOL_EXEC(compute_graph != nullptr, return compute_graph, "Computer graph is nullptr"); compute_graph->SetAllNodesInfo(graph_builder_impl.GetAllNodesInfo()); if (HasSameNameNode(compute_graph)) { GELOGW("Compute do not allow has same name nodes."); diff --git a/src/common/graph/opsproto/opsproto_manager.cc b/src/common/graph/opsproto/opsproto_manager.cc index a5bdb4c5..c2afc191 100644 --- a/src/common/graph/opsproto/opsproto_manager.cc +++ b/src/common/graph/opsproto/opsproto_manager.cc @@ -15,13 +15,11 @@ */ #include "graph/opsproto_manager.h" - -#include #include +#include #include #include #include - #include "debug/ge_util.h" #include "framework/common/debug/ge_log.h" #include "graph/debug/ge_log.h" @@ -155,7 +153,7 @@ void OpsProtoManager::LoadOpsProtoPluginSo(std::string &path) { // Load .so file for (auto elem : file_list) { - void *handle = dlopen(elem.c_str(), RTLD_NOW | RTLD_GLOBAL | RTLD_NODELETE); + void *handle = dlopen(elem.c_str(), RTLD_NOW | RTLD_GLOBAL); if (handle == nullptr) { GELOGW("OpsProtoManager dlopen failed, plugin name:%s. 
Message(%s).", elem.c_str(), dlerror()); continue; diff --git a/src/common/graph/option/ge_context.cc b/src/common/graph/option/ge_context.cc index bbf501c7..f5ebdeee 100644 --- a/src/common/graph/option/ge_context.cc +++ b/src/common/graph/option/ge_context.cc @@ -15,7 +15,6 @@ */ #include "./ge_context.h" - #include "./ge_global_options.h" #include "./ge_local_context.h" #include "framework/common/debug/ge_log.h" @@ -87,4 +86,5 @@ uint32_t GEContext::DeviceId() { return device_id_; } uint64_t GEContext::TraceId() { return trace_id_; } void GEContext::SetCtxDeviceId(uint32_t device_id) { device_id_ = device_id; } + } // namespace ge diff --git a/src/common/graph/shape_refiner.cc b/src/common/graph/shape_refiner.cc index 72cdef02..321786a9 100644 --- a/src/common/graph/shape_refiner.cc +++ b/src/common/graph/shape_refiner.cc @@ -22,6 +22,7 @@ #include #include +#include "graph/utils/graph_utils.h" #include "debug/ge_log.h" #include "debug/ge_op_types.h" #include "external/graph/operator.h" @@ -34,6 +35,122 @@ #include "utils/type_utils.h" namespace ge { +namespace { +constexpr const char *kRefIndex = "parent_node_index"; +graphStatus UpdateSubGraphDataNodes(const ConstNodePtr &node) { + auto op_desc = node->GetOpDesc(); + auto sub_graph_names = op_desc->GetSubgraphInstanceNames(); + if (sub_graph_names.empty()) { + return GRAPH_SUCCESS; + } + + auto root_graph = GraphUtils::FindRootGraph(node->GetOwnerComputeGraph()); + for (const auto &name : sub_graph_names) { + auto sub_graph = root_graph->GetSubgraph(name); + if (sub_graph == nullptr) { + GE_LOGE("Can node find the subgrpah %s for node %s", name.c_str(), node->GetName().c_str()); + return GRAPH_FAILED; + } + for (const auto &node_sub : sub_graph->GetDirectNode()) { + if (node_sub->GetType() != DATA) { + continue; + } + int ref_i; + auto data_opdesc = node_sub->GetOpDesc(); + if (data_opdesc == nullptr) { + GE_LOGE("Invalid data node on the sub graph %s parent node %s, no OpDesc", name.c_str(), + node->GetName().c_str()); + return GRAPH_FAILED; + } + if (!AttrUtils::GetInt(node_sub->GetOpDesc(), kRefIndex, ref_i)) { + GE_LOGE("Invalid data node on the sub graph %s parent node %s, no ref-index attribute", name.c_str(), + node->GetName().c_str()); + return GRAPH_FAILED; + } + auto input_desc = op_desc->MutableInputDesc(ref_i); + if (input_desc == nullptr) { + GE_LOGE( + "The ref index(%d) on the data %s on the sub graph %s " + "parent node %s are incompatible, inputs num %u", + ref_i, node_sub->GetName().c_str(), name.c_str(), node->GetName().c_str(), node->GetAllOutDataAnchorsSize()); + return GRAPH_FAILED; + } + auto ret = data_opdesc->UpdateInputDesc(0, *input_desc); + if (ret != GRAPH_SUCCESS) { + GE_LOGE("Failed to update input desc of data %s on the sub graph %s parent node %s", + node_sub->GetName().c_str(), name.c_str(), node->GetName().c_str()); + return ret; + } + ret = data_opdesc->UpdateOutputDesc(0, *input_desc); + if (ret != GRAPH_SUCCESS) { + GE_LOGE("Failed to update output desc of data %s on the sub graph %s parent node %s", + node_sub->GetName().c_str(), name.c_str(), node->GetName().c_str()); + return ret; + } + } + } + return GRAPH_SUCCESS; +} +graphStatus UpdateParentNodeOutTensor(const ConstNodePtr &node) { + auto op_desc = node->GetOpDesc(); + auto sub_graph_names = op_desc->GetSubgraphInstanceNames(); + if (sub_graph_names.empty()) { + return GRAPH_SUCCESS; + } + + auto root_graph = GraphUtils::FindRootGraph(node->GetOwnerComputeGraph()); + for (const auto &name : sub_graph_names) { + auto sub_graph = 
root_graph->GetSubgraph(name); + if (sub_graph == nullptr) { + GE_LOGE("Cannot find the subgraph %s for node %s", name.c_str(), node->GetName().c_str()); + return GRAPH_FAILED; + } + NodePtr netoutput = nullptr; + auto sub_nodes = sub_graph->GetDirectNode(); + for (size_t i = sub_nodes.size(); i > 0; --i) { + auto sub_node = sub_nodes.at(i - 1); + if (sub_node->GetType() == NETOUTPUT) { + netoutput = sub_node; + break; + } + } + if (netoutput == nullptr) { + GE_LOGE("No NetOutput node on sub graph %s, parent node %s", name.c_str(), node->GetName().c_str()); + return GRAPH_FAILED; + } + auto netoutput_opdesc = netoutput->GetOpDesc(); + if (netoutput_opdesc == nullptr) { + GE_LOGE("Invalid NetOutput node on sub graph %s, parent node %s, no OpDesc on it", name.c_str(), + node->GetName().c_str()); + return GRAPH_FAILED; + } + for (auto &edge_anchor : netoutput->GetAllInDataAnchors()) { + auto edge_desc = netoutput_opdesc->MutableInputDesc(edge_anchor->GetIdx()); + if (edge_desc == nullptr) { + GE_LOGE("Invalid NetOutput node on sub graph %s, parent node %s, can not find input tensor %d", name.c_str(), + node->GetName().c_str(), edge_anchor->GetIdx()); + return GRAPH_FAILED; + } + int ref_i; + if (!AttrUtils::GetInt(edge_desc, kRefIndex, ref_i)) { + // if there is no ref index on the TensorDesc, it means the output data will be ignored outside. + continue; + } + auto output_desc = op_desc->MutableOutputDesc(static_cast<uint32_t>(ref_i)); + if (output_desc == nullptr) { + GE_LOGE( + "The ref index(%d) on the input %d of netoutput %s on the sub graph %s " + "parent node %s are incompatible, outputs num %u", + ref_i, edge_anchor->GetIdx(), netoutput->GetName().c_str(), name.c_str(), node->GetName().c_str(), + node->GetAllOutDataAnchorsSize()); + return GRAPH_FAILED; + } + op_desc->UpdateOutputDesc(edge_anchor->GetIdx(), *edge_desc); + } + } + return GRAPH_SUCCESS; +} +} // namespace
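// Example (editor's sketch, not part of the original patch): the new before_subgraph
// flag splits inference around subgraph-owning nodes into two phases. Hypothetical
// driver code, assuming a NodePtr node; the actual call sites are not shown here:
//   (void)ge::ShapeRefiner::InferShapeAndType(node, true);   // push parent inputs into subgraph Data nodes
//   // ... infer the nodes of the subgraph bodies here ...
//   (void)ge::ShapeRefiner::InferShapeAndType(node, false);  // pull NetOutput shapes back to parent outputs
void ShapeRefiner::PrintInOutTensorShape(const ge::NodePtr &node, const std::string &phase) { if (node == nullptr) { GELOGE(GRAPH_FAILED, "node is null"); @@ -42,7 +159,7 @@ ge::OpDescPtr op_desc = node->GetOpDesc(); GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(GRAPH_FAILED, "op_desc is null."); return ); std::string str; - if (!op_desc->GetAllInputsDescPtr().empty()) { + if (op_desc->GetInputsSize() != 0) { std::string input_desc_str = "input shape: "; for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) { input_desc_str += "["; @@ -56,7 +173,7 @@ str += input_desc_str; } - if (!op_desc->GetAllOutputsDescPtr().empty()) { + if (op_desc->GetAllOutputsDescSize() != 0) { std::string output_desc_str = "output shape: "; for (const auto &output_desc : op_desc->GetAllOutputsDescPtr()) { if (output_desc == nullptr) { @@ -76,13 +193,24 @@ } graphStatus ShapeRefiner::InferShapeAndType(const ConstNodePtr &node, Operator &op) { + return InferShapeAndType(node, op, true); +} +graphStatus ShapeRefiner::InferShapeAndType(const ConstNodePtr &node, Operator &op, bool before_subgraph) { GE_IF_BOOL_EXEC(node == nullptr, GELOGE(GRAPH_FAILED, "node is null."); return GRAPH_FAILED); auto op_desc = node->GetOpDesc(); GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(GRAPH_FAILED, "op_desc is null."); return GRAPH_FAILED); const auto &op_type = op_desc->GetType(); + graphStatus ret; + if (before_subgraph) { + ret 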
= UpdateSubGraphDataNodes(node); + if (ret != GRAPH_SUCCESS) { + return ret; + } + } + // Get infer func and execute - graphStatus ret = op_desc->CallInferFunc(op); + ret = op_desc->CallInferFunc(op); if (ret == GRAPH_PARAM_INVALID) { // Op ir no infer func, try to get infer func from operator factory auto node_op = ge::OperatorFactory::CreateOperator("node_op", op_desc->GetType()); @@ -113,7 +241,14 @@ graphStatus ShapeRefiner::InferShapeAndType(const ConstNodePtr &node, Operator & ret = op_desc->CallInferFunc(op); GELOGI("op CallInferFunc second. ret: %u", ret); } - return ret; + if (ret != GRAPH_SUCCESS) { + return ret; + } + + if (!before_subgraph) { + return UpdateParentNodeOutTensor(node); + } + return GRAPH_SUCCESS; } InferenceContextPtr CreateInferenceContext(const std::unordered_map &context_map, @@ -179,8 +314,11 @@ InferenceContextPtr CreateInferenceContext(const std::unordered_map context_map; } - GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ShapeRefiner::InferShapeAndType(const NodePtr &node) { + return InferShapeAndType(node, true); +} +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ShapeRefiner::InferShapeAndType(const NodePtr &node, + bool before_subgraph) { GE_IF_BOOL_EXEC(node == nullptr, GELOGE(GRAPH_FAILED, "node is null."); return GRAPH_FAILED); if (node->Verify() != GRAPH_SUCCESS) { GELOGE(GRAPH_FAILED, "Verifying %s failed.", node->GetName().c_str()); @@ -199,7 +337,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ShapeRefiner::InferSh Operator op = OpDescUtils::CreateOperatorFromNode(node); op.SetInferenceContext(inference_context); - graphStatus status = InferShapeAndType(node, op); + graphStatus status = InferShapeAndType(node, op, before_subgraph); if (status == GRAPH_PARAM_INVALID || status == GRAPH_SUCCESS) { (void)ge::NodeUtils::UpdatePeerNodeInputDesc(node); } else { diff --git a/src/common/graph/tensor.cc b/src/common/graph/tensor.cc index 0be00988..8e0a9c7d 100644 --- a/src/common/graph/tensor.cc +++ b/src/common/graph/tensor.cc @@ -353,6 +353,7 @@ Tensor::Tensor(const TensorDesc &tensor_desc, const uint8_t *data, size_t size) } } } + impl = ComGraphMakeShared(tensor_desc, data, size); } @@ -516,13 +517,14 @@ graphStatus Tensor::IsValid() { GELOGW("mul overflow: %lu, %u", shape_size, type_length); } else { if (shape_size * type_length != data_size) { - // [Just log] Constructor GELOGW("tensor length not equal: shape_byte_size=%lu, data_size=%zu, dt_type=%s.", shape_size * type_length, data_size, TypeUtils::DataTypeToSerialString(data_type).c_str()); + return GRAPH_FAILED; } } } } + return GRAPH_SUCCESS; } @@ -539,7 +541,7 @@ GeTensorDesc TensorAdapter::TensorDesc2GeTensorDesc(const TensorDesc &tensor_des tensor_desc.GetDataType()); ge_tensor_desc.SetOriginShape(GeShape(tensor_desc.GetOriginShape().GetDims())); ge_tensor_desc.SetOriginFormat(tensor_desc.GetOriginFormat()); - auto size = static_cast(tensor_desc.GetSize()); + auto size = tensor_desc.GetSize(); TensorUtils::SetSize(ge_tensor_desc, size); auto real_dim_cnt = static_cast(tensor_desc.GetRealDimCnt()); @@ -552,7 +554,7 @@ TensorDesc TensorAdapter::GeTensorDesc2TensorDesc(const GeTensorDesc &ge_tensor_ ge_tensor_desc.GetDataType()); tensor_desc.SetOriginShape(Shape(ge_tensor_desc.GetOriginShape().GetDims())); tensor_desc.SetOriginFormat(ge_tensor_desc.GetOriginFormat()); - uint32_t size = 0; + int64_t size = 0; (void)TensorUtils::GetSize(ge_tensor_desc, size); tensor_desc.SetSize(size); diff --git a/src/common/graph/utils/ge_ir_utils.cc 
b/src/common/graph/utils/ge_ir_utils.cc index 0d22b615..b6367011 100644 --- a/src/common/graph/utils/ge_ir_utils.cc +++ b/src/common/graph/utils/ge_ir_utils.cc @@ -15,18 +15,21 @@ */ #include "graph/utils/ge_ir_utils.h" - #include - #include "framework/common/debug/ge_log.h" namespace { const char *const kControlAnchorIndex = ":-1"; const char *const kNodeTypeForSubgraph = "subgraph"; +const char *const kPrefixForInputDesc = "input_desc_attr_"; +const char *const kPrefixForOutputDesc = "output_desc_attr_"; const char *const kDumpGEGraph = "DUMP_GE_GRAPH"; const int8_t kMaxRecursionDepth = 10; const char *const kDumpGeGraph = std::getenv(kDumpGEGraph); const int64_t kDumpLevel = (kDumpGeGraph != nullptr) ? std::strtol(kDumpGeGraph, nullptr, 10) : ge::OnnxUtils::NO_DUMP; +const int64_t kInputPrefixLength = 5; +const int64_t kOutputPrefixLength = 6; +using AttrDefPair = ::google::protobuf::MapPair; } // namespace namespace ge { @@ -198,7 +201,7 @@ void OnnxUtils::AddAttrProto(onnx::NodeProto *node_proto, onnx::AttributeProto_A void OnnxUtils::AddAttrProto(onnx::NodeProto *node_proto, onnx::AttributeProto_AttributeType type, const string &name, ::google::protobuf::RepeatedField data) { if (node_proto == nullptr) { - GELOGE(FAILED, "Node_proto %s is nullptr.", name.c_str()); + GELOGE(FAILED, "Node proto %s is nullptr.", name.c_str()); return; } if (!data.empty()) { @@ -320,7 +323,16 @@ void OnnxUtils::AddAttrProtoForOpInAndOutDesc(onnx::NodeProto *node_proto, const auto cmps_tab_offset = tensor_descriptor->cmps_tab_offset(); AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, "input_desc_cmps_tab_offset:" + std::to_string(i), &cmps_tab_offset); + const auto &tensor_desc_map = tensor_descriptor->attr(); + std::string suffix = ":" + std::to_string(i); + AddAttrProtoForAttrsFromAttrMap(tensor_desc_map, node_proto, kPrefixForInputDesc, suffix); + } else { + GELOGW("Tensor descriptor is nullptr"); + continue; } + } else { + GELOGW("Input desc is nullptr"); + continue; } } } @@ -360,16 +372,25 @@ void OnnxUtils::AddAttrProtoForOpInAndOutDesc(onnx::NodeProto *node_proto, const auto real_dim_cnt = tensor_descriptor->real_dim_cnt(); AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, "output_desc_real_dim_cnt:" + std::to_string(i), &real_dim_cnt); + const auto &tensor_desc_map = tensor_descriptor->attr(); + std::string suffix = ":" + std::to_string(i); + AddAttrProtoForAttrsFromAttrMap(tensor_desc_map, node_proto, kPrefixForOutputDesc, suffix); + } else { + GELOGW("Tensor descriptor is nullptr"); + continue; } + } else { + GELOGW("Output desc is nullptr"); + continue; } } } } -void OnnxUtils::AddAttrProtoForAttrsFromOpDef(const ge::proto::OpDef *op_def, onnx::NodeProto *node_proto) { - GE_CHK_BOOL_EXEC(op_def != nullptr, return, "Opdef is nullptr"); - const auto &op_def_attr_map = op_def->attr(); - for (const auto &item : op_def_attr_map) { +void OnnxUtils::AddAttrProtoForAttrsFromAttrMap( + const ::google::protobuf::Map &attr_map, onnx::NodeProto *node_proto, + const std::string &prefix, const std::string &suffix) { + for (const auto &item : attr_map) { auto attr_name = item.first; auto attr_def = item.second; auto attr_type = attr_def.value_case(); @@ -377,36 +398,40 @@ void OnnxUtils::AddAttrProtoForAttrsFromOpDef(const ge::proto::OpDef *op_def, on const auto &tensor_def = attr_def.t(); const auto &tensor_desc = tensor_def.desc(); auto data_type = ge::proto::DataType_Name(tensor_desc.dtype()); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, attr_name + 
"_desc_dtype:", &data_type); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, prefix + attr_name + "_desc_dtype" + suffix, + &data_type); auto dims = tensor_desc.shape().dim(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, attr_name + "_desc_shape:", dims); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, prefix + attr_name + "_desc_shape" + suffix, + dims); auto layout = tensor_desc.layout(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, attr_name + "_desc_layout:", &layout); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, prefix + attr_name + "_desc_layout" + suffix, + &layout); auto device_type = tensor_desc.device_type(); AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, - attr_name + "_desc_device_type:", &device_type); + prefix + attr_name + "_desc_device_type" + suffix, &device_type); if (kDumpLevel == DUMP_ALL) { auto data = tensor_def.data(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, attr_name + "_data", &data); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, prefix + attr_name + "_data" + suffix, + &data); } } if (attr_type == ge::proto::AttrDef::kS) { if (kDumpLevel == DUMP_ALL) { auto str_value = attr_def.s(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, attr_name, &str_value); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, prefix + attr_name + suffix, &str_value); } } if (attr_type == ge::proto::AttrDef::kI) { auto int_value = attr_def.i(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, attr_name, &int_value); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, prefix + attr_name + suffix, &int_value); } if (attr_type == ge::proto::AttrDef::kF) { auto float_value = attr_def.f(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_FLOAT, attr_name, &float_value); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_FLOAT, prefix + attr_name + suffix, &float_value); } if (attr_type == ge::proto::AttrDef::kB) { auto int_value = static_cast(attr_def.b()); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, attr_name, &int_value); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INT, prefix + attr_name + suffix, &int_value); } if (attr_type == ge::proto::AttrDef::kList) { const auto &list_value = attr_def.list(); @@ -415,21 +440,21 @@ void OnnxUtils::AddAttrProtoForAttrsFromOpDef(const ge::proto::OpDef *op_def, on ge::proto::AttrDef_ListValue_ListValueType::AttrDef_ListValue_ListValueType_VT_LIST_STRING) { if (kDumpLevel == DUMP_ALL) { const auto &strings = list_value.s(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRINGS, attr_name, strings); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRINGS, prefix + attr_name + suffix, strings); } } if (list_value_type == ge::proto::AttrDef_ListValue_ListValueType::AttrDef_ListValue_ListValueType_VT_LIST_FLOAT) { const auto &floats = list_value.f(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_FLOATS, attr_name, floats); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_FLOATS, prefix + attr_name + suffix, floats); } if (list_value_type == ge::proto::AttrDef_ListValue_ListValueType::AttrDef_ListValue_ListValueType_VT_LIST_INT) { const auto &ints = list_value.i(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, attr_name, ints); + AddAttrProto(node_proto, 
onnx::AttributeProto_AttributeType_INTS, prefix + attr_name + suffix, ints); } if (list_value_type == ge::proto::AttrDef_ListValue_ListValueType::AttrDef_ListValue_ListValueType_VT_LIST_BOOL) { const auto &bools = list_value.b(); - AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, attr_name, bools); + AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, prefix + attr_name + suffix, bools); } } } @@ -481,8 +506,15 @@ void OnnxUtils::AddAttrProtoFromNodeMembers(const NodePtr &node, onnx::NodeProto AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "workspace_bytes", workspace_bytes); const auto &is_input_const = op_def->is_input_const(); AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "is_input_const", is_input_const); - AddAttrProtoForAttrsFromOpDef(op_def, node_proto); + const auto &op_def_attr_map = op_def->attr(); + AddAttrProtoForAttrsFromAttrMap(op_def_attr_map, node_proto); + } else { + GELOGE(FAILED, "Opdef is nullptr"); + return; } + } else { + GELOGE(FAILED, "Opdesc is nullptr"); + return; } } @@ -526,15 +558,13 @@ bool OnnxUtils::EncodeNodeLink(const NodePtr &node, onnx::NodeProto *node_proto) node_proto->clear_input(); // 1. Add input by in data edge for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { - if (in_data_anchor != nullptr) { - auto peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); - if ((peer_out_anchor != nullptr) && (peer_out_anchor->GetOwnerNode() != nullptr)) { - node_proto->add_input(peer_out_anchor->GetOwnerNode()->GetName() + ":" + - std::to_string(peer_out_anchor->GetIdx())); - } else { - // Add "" input - node_proto->add_input(""); - } + auto peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); + if ((peer_out_anchor != nullptr) && (peer_out_anchor->GetOwnerNode() != nullptr)) { + node_proto->add_input(peer_out_anchor->GetOwnerNode()->GetName() + ":" + + std::to_string(peer_out_anchor->GetIdx())); + } else { + // Add "" input + node_proto->add_input(""); } } @@ -547,6 +577,9 @@ bool OnnxUtils::EncodeNodeLink(const NodePtr &node, onnx::NodeProto *node_proto) node_proto->add_input(peer_out_anchor->GetOwnerNode()->GetName() + kControlAnchorIndex); } } + } else { + GELOGE(FAILED, "Incontrol anchor is nullptr"); + return false; } // 3. 
Add output for Netron visual support @@ -584,7 +617,7 @@ void OnnxUtils::EncodeTypeProtoTensorType(const NodePtr &node, onnx::TypeProto_T } const auto &op_desc = node->GetOpDesc(); if (op_desc != nullptr) { - auto size_out = op_desc->GetOutputsSize(); + uint32_t size_out = static_cast(op_desc->GetOutputsSize()); if (size_out > 0) { for (uint32_t i = 0; i < size_out; i++) { const ConstGeTensorDescPtr &ge_tensor = op_desc->GetOutputDescPtr(i); @@ -598,7 +631,13 @@ void OnnxUtils::EncodeTypeProtoTensorType(const NodePtr &node, onnx::TypeProto_T auto dim = shape->add_dim(); dim->set_dim_value(d); } + } else { + GELOGW("Shape is nullptr"); + continue; } + } else { + GELOGW("Ge tensor is nullptr"); + continue; } } } @@ -666,7 +705,7 @@ bool OnnxUtils::ConvertGeModelToModelProto(const ge::Model &model, onnx::ModelPr } // For subgraphs: a subgraph is represented by a node - for (const auto &sub_compute_graph : compute_graph->sub_graph_) { + for (const auto &sub_compute_graph : compute_graph->GetAllSubgraphs()) { if (sub_compute_graph != nullptr) { auto node_proto = graph_proto->add_node(); if (node_proto == nullptr) { @@ -679,6 +718,10 @@ bool OnnxUtils::ConvertGeModelToModelProto(const ge::Model &model, onnx::ModelPr attr->set_name("graph"); attr->set_type(onnx::AttributeProto_AttributeType_GRAPH); auto sub_graph_proto = attr->mutable_g(); + if (sub_graph_proto == nullptr) { + GELOGW("Sub graph proto is nullptr"); + continue; + } if (!EncodeGraph(sub_compute_graph, sub_graph_proto)) { GELOGW("Encode sub graph: %s fail", sub_compute_graph->GetName().c_str()); continue; @@ -831,56 +874,116 @@ void OnnxUtils::DecodeAttribute(const onnx::AttributeProto &attr_proto, int64_t value = attr_proto.i(); } -void OnnxUtils::DecodeNodeAttributeForOpInAndOutDesc(const onnx::AttributeProto &attr_proto, - const std::string &attr_name_for_input_output_desc, int32_t index, - OpDescPtr &op_desc) { - if (op_desc == nullptr || op_desc->MutableInputDesc(static_cast(index)) == nullptr) { - GELOGE(GRAPH_FAILED, "op_desc or op_desc->MutableInputDesc(index) is nullptr"); +void OnnxUtils::DecodeNodeAttributeForOpInDesc(const onnx::AttributeProto &attr_proto, + const std::string &attr_name_for_input_desc, int32_t index, + OpDescPtr &op_desc) { + if (op_desc->MutableInputDesc(static_cast(index)) == nullptr) { + GELOGE(GRAPH_FAILED, "[op name %s,attr name %s]op_desc->MutableInputDesc(static_cast(index)) is nullptr", + op_desc->GetName().c_str(), attr_name_for_input_desc.c_str()); return; } - if (attr_name_for_input_output_desc == "input_desc_dtype") { + if (attr_name_for_input_desc == "input_desc_dtype") { auto data_type = TypeUtils::SerialStringToDataType(attr_proto.s()); op_desc->MutableInputDesc(static_cast(index))->SetDataType(data_type); - } else if (attr_name_for_input_output_desc == "input_desc_shape") { + } else if (attr_name_for_input_desc == "input_desc_shape") { std::vector ints; DecodeAttribute(attr_proto, ints); GeShape ge_shape(ints); op_desc->MutableInputDesc(static_cast(index))->SetShape(ge_shape); - } else if (attr_name_for_input_output_desc == "input_desc_layout") { + } else if (attr_name_for_input_desc == "input_desc_layout") { auto data_format = TypeUtils::SerialStringToFormat(attr_proto.s()); op_desc->MutableInputDesc(static_cast(index))->SetFormat(data_format); - } else if (attr_name_for_input_output_desc == "input_desc_origin_shape") { + } else if (attr_name_for_input_desc == "input_desc_origin_shape") { std::vector ints; DecodeAttribute(attr_proto, ints); GeShape ge_shape(ints); 
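// ---- editor's note (condensed sketch of the subgraph hunk above; assumes
// an onnx::GraphProto *graph_proto and a sub_compute_graph are in scope) ----
// Each subgraph is emitted as a single node (type kNodeTypeForSubgraph per
// the constant at the top of this file) whose GRAPH-typed attribute named
// "graph" carries the recursively encoded body:
onnx::NodeProto *node_proto = graph_proto->add_node();
auto *attr = node_proto->add_attribute();
attr->set_name("graph");
attr->set_type(onnx::AttributeProto_AttributeType_GRAPH);
onnx::GraphProto *sub_graph_proto = attr->mutable_g();
// EncodeGraph(sub_compute_graph, sub_graph_proto) then fills in the body;
// on the decode side, DecodeGraph bounds the nesting with kMaxRecursionDepth.
// ---- end editor's note ----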
op_desc->MutableInputDesc(static_cast(index))->SetOriginShape(ge_shape); - } else if (attr_name_for_input_output_desc == "input_desc_origin_layout") { + } else if (attr_name_for_input_desc == "input_desc_origin_layout") { auto data_format = TypeUtils::SerialStringToFormat(attr_proto.s()); op_desc->MutableInputDesc(static_cast(index))->SetOriginFormat(data_format); - } else if (attr_name_for_input_output_desc == "output_desc_dtype") { + } else if (attr_name_for_input_desc == "input_desc_size") { + int64_t input_size = 0; + auto tensor_descriptor = op_desc->MutableInputDesc(static_cast(index))->tensor_descriptor_.GetProtoMsg(); + DecodeAttribute(attr_proto, input_size); + tensor_descriptor->set_size(input_size); + } else if (attr_name_for_input_desc == "input_desc_data_offset") { + auto tensor_descriptor = op_desc->MutableInputDesc(static_cast(index))->tensor_descriptor_.GetProtoMsg(); + int64_t offset = 0; + DecodeAttribute(attr_proto, offset); + tensor_descriptor->set_data_offset(offset); + } else { + return; + } +} + +void OnnxUtils::DecodeNodeAttributeForOpOutDesc(const onnx::AttributeProto &attr_proto, + const std::string &attr_name_for_output_desc, int32_t index, + OpDescPtr &op_desc) { + if (op_desc->MutableOutputDesc(static_cast(index)) == nullptr) { + GELOGE(GRAPH_FAILED, "[op name %s,attr name %s]op_desc->MutableOutputDesc(static_cast(index)) is nullptr", + op_desc->GetName().c_str(), attr_name_for_output_desc.c_str()); + return; + } + if (attr_name_for_output_desc == "output_desc_dtype") { auto data_type = TypeUtils::SerialStringToDataType(attr_proto.s()); op_desc->MutableOutputDesc(static_cast(index))->SetDataType(data_type); - } else if (attr_name_for_input_output_desc == "output_desc_shape") { + } else if (attr_name_for_output_desc == "output_desc_shape") { std::vector ints; DecodeAttribute(attr_proto, ints); GeShape ge_shape(ints); op_desc->MutableOutputDesc(static_cast(index))->SetShape(ge_shape); - } else if (attr_name_for_input_output_desc == "output_desc_layout") { + } else if (attr_name_for_output_desc == "output_desc_layout") { auto data_format = TypeUtils::SerialStringToFormat(attr_proto.s()); op_desc->MutableOutputDesc(static_cast(index))->SetFormat(data_format); - } else if (attr_name_for_input_output_desc == "output_desc_origin_shape") { + } else if (attr_name_for_output_desc == "output_desc_origin_shape") { std::vector ints; DecodeAttribute(attr_proto, ints); GeShape ge_shape(ints); op_desc->MutableOutputDesc(static_cast(index))->SetOriginShape(ge_shape); - } else if (attr_name_for_input_output_desc == "output_desc_origin_layout") { + } else if (attr_name_for_output_desc == "output_desc_origin_layout") { auto data_format = TypeUtils::SerialStringToFormat(attr_proto.s()); op_desc->MutableOutputDesc(static_cast(index))->SetOriginFormat(data_format); + } else if (attr_name_for_output_desc == "output_desc_size") { + int64_t output_size = 0; + auto tensor_descriptor = op_desc->MutableOutputDesc(static_cast(index))->tensor_descriptor_.GetProtoMsg(); + DecodeAttribute(attr_proto, output_size); + tensor_descriptor->set_size(output_size); + } else if (attr_name_for_output_desc == "output_desc_data_offset") { + auto tensor_descriptor = op_desc->MutableOutputDesc(static_cast(index))->tensor_descriptor_.GetProtoMsg(); + int64_t offset = 0; + DecodeAttribute(attr_proto, offset); + tensor_descriptor->set_data_offset(offset); + } else { + return; + } +} + +void OnnxUtils::DecodeNodeAttributeForOpInAndOutDesc(const onnx::AttributeProto &attr_proto, + const std::string 
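// ---- editor's note (illustrative sketch, not part of the patch) ----
// The split into DecodeNodeAttributeForOpInDesc / DecodeNodeAttributeForOpOutDesc
// is driven by a simple prefix test (see the dispatcher just below):
// kInputPrefixLength (5) and kOutputPrefixLength (6) are just the lengths of
// "input" and "output". Self-contained equivalent:
#include <string>

bool IsInputDescAttr(const std::string &name) {
  return name.compare(0, 5, "input") == 0;   // kInputPrefixLength
}
bool IsOutputDescAttr(const std::string &name) {
  return name.compare(0, 6, "output") == 0;  // kOutputPrefixLength
}
// e.g. "input_desc_size" goes to the input decoder, "output_desc_data_offset"
// to the output decoder, and anything else is silently ignored.
// ---- end editor's note ----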
&attr_name_for_input_output_desc, int32_t index, + OpDescPtr &op_desc) { + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "op_desc is nullptr"); + return; + } + if (attr_name_for_input_output_desc.substr(0, kInputPrefixLength) == "input") { + DecodeNodeAttributeForOpInDesc(attr_proto, attr_name_for_input_output_desc, index, op_desc); + } else if (attr_name_for_input_output_desc.substr(0, kOutputPrefixLength) == "output") { + DecodeNodeAttributeForOpOutDesc(attr_proto, attr_name_for_input_output_desc, index, op_desc); } else { return; } } +void OnnxUtils::DecodeNodeAttributeForOpDef(const onnx::AttributeProto &attr_proto, ge::proto::OpDef &op_def) { + auto attr_map = op_def.mutable_attr(); + const auto &attr_name = attr_proto.name(); + ge::proto::AttrDef op_attr; + int64_t value = 0; + DecodeAttribute(attr_proto, value); + op_attr.set_i(value); + attr_map->insert(AttrDefPair(attr_name, op_attr)); +} + void OnnxUtils::DecodeNodeAttributeForOpDesc(const onnx::AttributeProto &attr_proto, OpDescPtr &op_desc) { if (op_desc == nullptr) { GELOGE(GRAPH_FAILED, "DecodeNodeAttributeForOpDesc: op_desc is nullptr"); @@ -910,6 +1013,16 @@ void OnnxUtils::DecodeNodeAttributeForOpDesc(const onnx::AttributeProto &attr_pr std::vector ints; DecodeAttribute(attr_proto, ints); op_desc->SetDstIndex(ints); + } else if (attr_name == "fusion_scope") { + DecodeNodeAttributeForOpDef(attr_proto, *op_desc->op_def_.GetProtoMsg()); + } else if (attr_name == "input_i") { + std::vector ints; + DecodeAttribute(attr_proto, ints); + op_desc->SetInputOffset(ints); + } else if (attr_name == "output_i") { + std::vector ints; + DecodeAttribute(attr_proto, ints); + op_desc->SetOutputOffset(ints); } else { return; } @@ -939,20 +1052,14 @@ bool OnnxUtils::DecodeNodeDesc(const onnx::NodeProto *node_proto, OpDescPtr &op_ auto size_in = attr.i(); for (int64_t i = 0; i < size_in; i++) { GeTensorDesc ge_tensor_desc; - if (op_desc->AddInputDesc(ge_tensor_desc) != GRAPH_SUCCESS) { - GELOGW("Add inputdesc failed"); - continue; - } + GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(ge_tensor_desc) == GRAPH_SUCCESS, continue, "Add inputdesc failed."); } } if (attr.name() == "output_desc_nums") { auto size_out = attr.i(); for (int64_t i = 0; i < size_out; i++) { GeTensorDesc ge_tensor_desc; - if (op_desc->AddInputDesc(ge_tensor_desc) != GRAPH_SUCCESS) { - GELOGW("add inputdesc failed"); - continue; - } + GE_CHK_BOOL_EXEC(op_desc->AddOutputDesc(ge_tensor_desc) == GRAPH_SUCCESS, continue, "Add outputdesc failed."); } } } @@ -970,10 +1077,7 @@ bool OnnxUtils::DecodeGraph(int recursion_depth, const onnx::GraphProto &graph_p } graph = ComGraphMakeShared(graph_proto.name()); - if (graph == nullptr) { - GELOGE(GRAPH_FAILED, "ComputeGraph make shared failed"); - return false; - } + GE_CHK_BOOL_EXEC(graph != nullptr, return false, "ComputeGraph make shared failed"); /// 1. 
Decode all nodes first, node should include input /// and output nodes and nodes which represent sub graphs std::map node_map; diff --git a/src/common/graph/utils/ge_ir_utils.h b/src/common/graph/utils/ge_ir_utils.h index d18500a0..9b16be18 100644 --- a/src/common/graph/utils/ge_ir_utils.h +++ b/src/common/graph/utils/ge_ir_utils.h @@ -131,6 +131,10 @@ class OnnxUtils { static void AddAttrProtoForOpInAndOutDesc(onnx::NodeProto *node_proto, const OpDescPtr &op_desc); + static void AddAttrProtoForAttrsFromAttrMap(const ::google::protobuf::Map &attr_map, + onnx::NodeProto *node_proto, const std::string &prefix = "", + const std::string &suffix = ""); + static void AddAttrProtoForAttrsFromOpDef(const ge::proto::OpDef *op_def, onnx::NodeProto *node_proto); static onnx::TensorProto_DataType EncodeDataType(ge::DataType data_type); @@ -172,10 +176,20 @@ class OnnxUtils { static void DecodeAttribute(const onnx::AttributeProto &attr_proto, std::string &value); + static void DecodeNodeAttributeForOpOutDesc(const onnx::AttributeProto &attr_proto, + const std::string &attr_name_for_output_desc, int32_t index, + OpDescPtr &op_desc); + + static void DecodeNodeAttributeForOpInDesc(const onnx::AttributeProto &attr_proto, + const std::string &attr_name_for_input_desc, int32_t index, + OpDescPtr &op_desc); + static void DecodeNodeAttributeForOpInAndOutDesc(const onnx::AttributeProto &attr_proto, const std::string &attr_name_for_input_output_desc, int32_t index, OpDescPtr &op_desc); + static void DecodeNodeAttributeForOpDef(const onnx::AttributeProto &attr_proto, ge::proto::OpDef &op_def); + static void DecodeNodeAttributeForOpDesc(const onnx::AttributeProto &attr_proto, OpDescPtr &op_desc); static bool DecodeNodeLinkImp(const NodeLinkInfo &item, NodePtr &node_ptr); diff --git a/src/common/graph/utils/graph_utils.cc b/src/common/graph/utils/graph_utils.cc index adb36db9..c5e45516 100644 --- a/src/common/graph/utils/graph_utils.cc +++ b/src/common/graph/utils/graph_utils.cc @@ -36,6 +36,10 @@ #include "utils/attr_utils.h" #include "utils/ge_ir_utils.h" #include "utils/node_utils.h" +#include "debug/ge_op_types.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/op_desc_utils.h" +#include "graph/utils/tensor_utils.h" using google::protobuf::io::FileOutputStream; @@ -95,8 +99,16 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus GraphUtils::AddEdge(c const InDataAnchorPtr &dst, const Format &dst_format) { if ((src != nullptr) && (src->LinkTo(dst) == GRAPH_SUCCESS)) { - (void)AnchorUtils::SetFormat(src, src_format); - (void)AnchorUtils::SetFormat(dst, dst_format); + auto ret = AnchorUtils::SetFormat(src, src_format); + if (ret != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Set format failed, format is %d", static_cast(src_format)); + return ret; + } + ret = AnchorUtils::SetFormat(dst, dst_format); + if (ret != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Set format failed,format is %d", static_cast(dst_format)); + return ret; + } return GRAPH_SUCCESS; } GELOGE(GRAPH_FAILED, "Add edge Failed."); @@ -203,6 +215,15 @@ GraphUtils::RemoveNodeWithoutRelink(const ComputeGraphPtr &compute_graph, const // If the node save as output node, delete it (void)compute_graph->RemoveOutputNode(node); + // If the node has sub-graphs, delete them + auto sub_graph_names = node->GetOpDesc()->GetSubgraphInstanceNames(); + if (!sub_graph_names.empty()) { + auto root_graph = FindRootGraph(compute_graph); + for (const auto &name : sub_graph_names) { + root_graph->RemoveSubgraph(name); + } + } + auto iter = 
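// ---- editor's note (hypothetical helper, not part of the patch) ----
// The AddEdge hunk above replaces "(void)AnchorUtils::SetFormat(...)" with
// explicit checks that log and propagate the failing status. The same
// pattern could be factored into a helper like the macro below; GE itself
// writes the checks out longhand.
#define GE_RETURN_IF_SET_FORMAT_FAILED(anchor, fmt)                 \
  do {                                                              \
    graphStatus rc_ = AnchorUtils::SetFormat((anchor), (fmt));      \
    if (rc_ != GRAPH_SUCCESS) {                                     \
      GELOGE(GRAPH_FAILED, "Set format failed, format is %d",       \
             static_cast<int>(fmt));                                \
      return rc_;                                                   \
    }                                                               \
  } while (0)
// ---- end editor's note ----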
find(compute_graph->nodes_.begin(), compute_graph->nodes_.end(), node); if (iter != compute_graph->nodes_.end()) { compute_graph->nodes_.erase(iter); @@ -216,6 +237,7 @@ GraphUtils::RemoveNodeWithoutRelink(const ComputeGraphPtr &compute_graph, const /// A ---> B transfered to A ---> N ---> B graphStatus InsertTransNode(ComputeGraph &compute_graph, const InDataAnchorPtr &in_data_anchor, const std::vector &vec_op_desc) { + GE_CHECK_NOTNULL(in_data_anchor); for (const auto &op_desc : vec_op_desc) { GE_CHECK_NOTNULL(op_desc); @@ -287,11 +309,28 @@ graphStatus InsertTransNode(ComputeGraph &compute_graph, const InDataAnchorPtr & "Vistor is empty"); GE_CHECK_NOTNULL(node_to_insert->GetOutDataAnchor(0)->GetPeerInDataAnchors().at(0)); - (void)AnchorUtils::SetFormat(node_to_insert->GetInDataAnchor(0)->GetPeerOutAnchor(), in_data_anchor_src_format); - (void)AnchorUtils::SetFormat(node_to_insert->GetInDataAnchor(0), (Format)input_format); - (void)AnchorUtils::SetFormat(node_to_insert->GetOutDataAnchor(0), (Format)output_format); - (void)AnchorUtils::SetFormat(node_to_insert->GetOutDataAnchor(0)->GetPeerInDataAnchors().at(0), - in_data_anchor_dst_format); + auto status = + AnchorUtils::SetFormat(node_to_insert->GetInDataAnchor(0)->GetPeerOutAnchor(), in_data_anchor_src_format); + if (status != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Set format failed,format is %d", static_cast(in_data_anchor_src_format)); + return status; + } + status = AnchorUtils::SetFormat(node_to_insert->GetInDataAnchor(0), static_cast(input_format)); + if (status != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Set format failed,format is %ld", input_format); + return status; + } + status = AnchorUtils::SetFormat(node_to_insert->GetOutDataAnchor(0), static_cast(output_format)); + if (status != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Set format failed,format is %ld", output_format); + return status; + } + status = AnchorUtils::SetFormat(node_to_insert->GetOutDataAnchor(0)->GetPeerInDataAnchors().at(0), + in_data_anchor_dst_format); + if (status != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Set format failed,format is %d", static_cast(in_data_anchor_dst_format)); + return status; + } } std::vector original_nodes; GraphUtils::RecordOriginalNames(original_nodes, node_to_insert); @@ -309,6 +348,71 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus GraphUtils::InsertTra return ret; } +/// +/// @brief Insert node: src->insert_node:input_index, insert_node:output_index->dst +/// @param [in] src +/// @param [in] dsts +/// @param [in] insert_node +/// @param [in] input_index +/// @param [in] output_index +/// @return graphStatus +/// +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus +GraphUtils::InsertNodeBefore(const OutDataAnchorPtr &src, const std::vector &dsts, + const NodePtr &insert_node, uint32_t input_index, uint32_t output_index) { + GE_CHECK_NOTNULL(src); + GE_CHECK_NOTNULL(insert_node); + + NodePtr src_node = src->GetOwnerNode(); + if (src_node->GetOwnerComputeGraph() != insert_node->GetOwnerComputeGraph()) { + GELOGE(GRAPH_FAILED, "src:%s and insert_node:%s not exist in the same graph.", src_node->GetName().c_str(), + insert_node->GetName().c_str()); + return GRAPH_FAILED; + } + + if (AddEdge(src, insert_node->GetInDataAnchor(input_index)) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "AddEdge %s->%s failed.", src_node->GetName().c_str(), insert_node->GetName().c_str()); + return GRAPH_FAILED; + } + + OutControlAnchorPtr src_out_ctrl_anchor = src_node->GetOutControlAnchor(); + GE_CHECK_NOTNULL(src_out_ctrl_anchor); 
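// ---- editor's note (illustrative sketch, not part of the patch) ----
// Rationale for the new cleanup in RemoveNodeWithoutRelink: subgraph
// instances are registered on the *root* graph (see FindRootGraph, added
// later in this patch), so removing a node that owns subgraphs must erase
// them there rather than on the node's immediate owner graph:
auto root_graph = GraphUtils::FindRootGraph(compute_graph);
for (const auto &name : node->GetOpDesc()->GetSubgraphInstanceNames()) {
  root_graph->RemoveSubgraph(name);
}
// ---- end editor's note ----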
+ + for (auto &dst : dsts) { + GE_CHECK_NOTNULL(dst); + NodePtr dst_node = dst->GetOwnerNode(); + GE_CHECK_NOTNULL(dst_node); + GELOGI("Insert node %s between %s->%s.", insert_node->GetName().c_str(), src_node->GetName().c_str(), + dst_node->GetName().c_str()); + if (src_node->GetOwnerComputeGraph() != dst_node->GetOwnerComputeGraph()) { + GELOGE(GRAPH_FAILED, "src:%s and dst:%s not exist in the same graph.", src_node->GetName().c_str(), + dst_node->GetName().c_str()); + return GRAPH_FAILED; + } + + if ((RemoveEdge(src, dst) != GRAPH_SUCCESS) || + (AddEdge(insert_node->GetOutDataAnchor(output_index), dst) != GRAPH_SUCCESS)) { + GELOGE(GRAPH_FAILED, "ReplaceEdge from %s->%s to %s->%s failed.", src_node->GetName().c_str(), + dst_node->GetName().c_str(), insert_node->GetName().c_str(), dst_node->GetName().c_str()); + return GRAPH_FAILED; + } + + OutControlAnchorPtr new_out_ctrl_anchor = insert_node->GetOutControlAnchor(); + GE_CHECK_NOTNULL(new_out_ctrl_anchor); + for (InControlAnchorPtr peer_in_ctrl_anchor : src_out_ctrl_anchor->GetPeerInControlAnchors()) { + if ((RemoveEdge(src_out_ctrl_anchor, peer_in_ctrl_anchor) != GRAPH_SUCCESS) || + (AddEdge(new_out_ctrl_anchor, peer_in_ctrl_anchor) != GRAPH_SUCCESS)) { + GELOGE(GRAPH_FAILED, "ReplaceEdge from %s->%s to %s->%s failed.", src_node->GetName().c_str(), + peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str(), insert_node->GetName().c_str(), + peer_in_ctrl_anchor->GetOwnerNode()->GetName().c_str()); + return GRAPH_FAILED; + } + } + } + + return GRAPH_SUCCESS; +} + GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus GraphUtils::RemoveJustNode(ComputeGraph &compute_graph, const NodePtr &node) { if (node == nullptr) { @@ -341,19 +445,19 @@ void GraphUtils::RecordOriginalNames(std::vector original_nodes, co GELOGE(GRAPH_FAILED, "Node %s get opdesc is nullptr", node_tmp->GetName().c_str()); continue; } - (void)ge::AttrUtils::GetListStr(opdesc_tmp, "original_op_names", names_tmp); + auto ret = ge::AttrUtils::GetListStr(opdesc_tmp, ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, names_tmp); + if (!ret) { + GELOGW("Get list str failed"); + continue; + } if (names_tmp.size() != 0) { original_names.insert(original_names.end(), names_tmp.begin(), names_tmp.end()); } else { original_names.push_back(opdesc_tmp->GetName()); } } - if (original_names.size() == 0) { - std::string tmp; - original_names.push_back(tmp); - } - GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(node->GetOpDesc(), "original_op_names", original_names), return, - "Set original_op_names fail."); + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(node->GetOpDesc(), ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, original_names), + return, "Set original_op_names fail."); } GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::RecordOriginalNames(std::vector names_tmp, @@ -361,57 +465,13 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::RecordOriginalNa GE_CHK_BOOL_EXEC(node != nullptr, return, "node is null."); std::vector original_names; if (names_tmp.size() != 0) { - (void)original_names.insert(original_names.end(), names_tmp.begin(), names_tmp.end()); + original_names.insert(original_names.end(), names_tmp.begin(), names_tmp.end()); } else { std::string tmp; original_names.push_back(tmp); } - GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(node->GetOpDesc(), "original_op_names", original_names), return, - "Set original_op_names fail."); -} - -// Check global_step Node has IsVariable and Read. 
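// ---- editor's usage sketch (hypothetical caller, not part of the patch;
// the dsts element type is assumed to be InDataAnchorPtr, the template
// argument having been lost in this excerpt) ----
// Routing src:0 through a freshly added node before all of its current data
// consumers; InsertNodeBefore also moves src's outgoing control edges onto
// the inserted node, as implemented above.
#include <vector>
#include "graph/utils/graph_utils.h"

ge::graphStatus RouteThrough(const ge::NodePtr &src_node, const ge::NodePtr &inserted) {
  auto src = src_node->GetOutDataAnchor(0);
  std::vector<ge::InDataAnchorPtr> dsts;
  for (const auto &peer : src->GetPeerInDataAnchors()) {
    dsts.push_back(peer);
  }
  // use the inserted node's input 0 and output 0
  return ge::GraphUtils::InsertNodeBefore(src, dsts, inserted, 0, 0);
}
// ---- end editor's note ----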
-GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::CheckGlobalStepNode(const ge::NodePtr &node) { - GE_CHK_BOOL_EXEC( - node != nullptr, { return false; }, "node is null."); - bool has_variable = false; - bool has_cond_read = false; - for (const auto &out : node->GetOutDataNodes()) { - if ((out->GetType() == "VarIsInitializedOp") && (out->GetName() == "global_step/IsVariableInitialized")) { - has_variable = true; - } else if ((out->GetType() == "FrameworkOp") && (out->GetName() == "global_step/cond/read/Switch")) { - has_cond_read = true; - } - } - return (has_variable && has_cond_read); -} - -// Check origin ComputeGraph is TrainGraph. -GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::CheckIsTrainGraph( - const ge::ComputeGraphPtr &compute_graph) { - GE_CHK_BOOL_EXEC( - compute_graph != nullptr, { return false; }, "compute_graph is nullptr"); - - bool is_iterator_v2 = false; - bool is_train_graph = false; - for (const auto &node : compute_graph->GetDirectNode()) { - if ((node->GetType() == "ApplyMomentum") || (node->GetType() == "ApplyGradientDescent")) { - GELOGI("graph needs iteration."); - return true; - } - // Check global_step has IsVariable and Read. - if ((node->GetType() == "Variable") && (node->GetName() == "global_step")) { - is_train_graph = CheckGlobalStepNode(node); - } else if ((node->GetType() == "FrameworkOp") && (node->GetName() == "IteratorGetNext")) { - // Train Graph must have GetNext. - is_iterator_v2 = true; - } - if (is_iterator_v2 && is_train_graph) { - break; - } - } - GELOGI("Generate: compute_graph is_iterator_v2[%d], is_train_graph[%d].", is_iterator_v2, is_train_graph); - return (is_iterator_v2 && is_train_graph); + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(node->GetOpDesc(), ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, original_names), + return, "Set original_op_names fail."); } GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::MatchDumpStr(const std::string &suffix) { @@ -474,7 +534,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraph(cons ge::Model model("", ""); model.SetGraph(GraphUtils::CreateGraphFromComputeGraph(std::const_pointer_cast(graph))); Buffer buffer; - model.Save(buffer); + model.Save(buffer, true); // Write file ge::proto::ModelDef ge_proto; @@ -524,7 +584,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::WriteProtoToText const int FILE_AUTHORITY = 0600; int fd = open(real_path, O_WRONLY | O_CREAT | O_TRUNC, FILE_AUTHORITY); if (fd < 0) { - GELOGE(GRAPH_FAILED, "fail to open the file: %s", real_path); + GELOGE(GRAPH_FAILED, "fail to open the file: %s, %s", real_path, strerror(errno)); return; } google::protobuf::io::FileOutputStream *output = new (std::nothrow) FileOutputStream(fd); @@ -556,6 +616,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::WriteProtoToText static int64_t maxDumpFileSize = 0; if (maxDumpFileSize == 0) { string opt = "0"; + // Can not check return value (void)GetContext().GetOption("ge.maxDumpFileSize", opt); maxDumpFileSize = atol(opt.c_str()); } @@ -608,7 +669,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void GraphUtils::DumpGEGraphToOnn return; } - // 1.Get onnx::ModelProto from ge::Model + // 1.Get ge::onnx::ModelProto from ge::Model ge::Model model("GE", ""); std::shared_ptr compute_graph_ptr = ComGraphMakeShared(compute_graph); model.SetGraph(GraphUtils::CreateGraphFromComputeGraph(std::const_pointer_cast(compute_graph_ptr))); @@ -638,6 +699,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void 
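// ---- editor's note (self-contained POSIX sketch, not part of the patch) ----
// The WriteProtoToText change above appends strerror(errno) to the log, so a
// failed dump explains itself (permissions, missing directory, ...):
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>

int OpenForDump(const char *path) {
  const int kFileAuthority = 0600;
  int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, kFileAuthority);
  if (fd < 0) {
    std::fprintf(stderr, "fail to open the file: %s, %s\n", path, std::strerror(errno));
  }
  return fd;
}
// ---- end editor's note ----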
GraphUtils::DumpGEGraphToOnn /// setw(5) is for formatted sort std::stringstream stream_file_name; stream_file_name << "ge_onnx_" << std::setw(5) << std::setfill('0') << file_index; + stream_file_name << "_graph_" << compute_graph.GetGraphID(); stream_file_name << "_" << suffix << ".pbtxt"; std::string proto_file = stream_file_name.str(); if ((proto_file.length()) >= NAME_MAX) { @@ -937,32 +999,48 @@ graphStatus ReplaceInDataAnchors(const Node::Vistor &new_ins, graphStatus ReplaceControlAnchors(const NodePtr &new_node, const NodePtr &old_node) { GE_CHECK_NOTNULL(new_node); - GE_CHECK_NOTNULL(old_node); GE_CHECK_NOTNULL(new_node->GetInControlAnchor()); + GE_CHECK_NOTNULL(old_node); GE_CHECK_NOTNULL(old_node->GetInControlAnchor()); auto peer_out_anchors = old_node->GetInControlAnchor()->GetPeerAnchors(); auto new_in_control_anchor = new_node->GetInControlAnchor(); + auto exists_out_anchors = new_in_control_anchor->GetPeerAnchors(); + auto exists_out_anchors_set = std::set(exists_out_anchors.begin(), exists_out_anchors.end()); for (const auto &peer_out_anchor : peer_out_anchors) { if (peer_out_anchor != nullptr) { + if (exists_out_anchors_set.count(peer_out_anchor) > 0) { + continue; + } auto ret = GraphUtils::AddEdge(peer_out_anchor, new_in_control_anchor); if (ret != GRAPH_SUCCESS) { GELOGE(GRAPH_FAILED, "Add edge failed"); return GRAPH_FAILED; } + } else { + GELOGW("peer outanchor is nullptr"); + continue; } } auto old_out_control_anchor = old_node->GetOutControlAnchor(); GE_CHECK_NOTNULL(old_out_control_anchor); auto peer_in_anchors = old_out_control_anchor->GetPeerAnchors(); auto new_out_control_anchor = new_node->GetOutControlAnchor(); + auto exists_in_anchors = new_out_control_anchor->GetPeerAnchors(); + auto exists_in_anchors_set = std::set(exists_in_anchors.begin(), exists_in_anchors.end()); GE_CHECK_NOTNULL(new_out_control_anchor); for (const auto &peer_in_anchor : peer_in_anchors) { if (peer_in_anchor != nullptr) { + if (exists_in_anchors_set.count(peer_in_anchor) > 0) { + continue; + } auto ret = GraphUtils::AddEdge(new_out_control_anchor, peer_in_anchor); if (ret != GRAPH_SUCCESS) { GELOGE(GRAPH_FAILED, "Add edge failed"); return GRAPH_FAILED; } + } else { + GELOGW("Peer inanchor is nullptr"); + continue; } } @@ -976,9 +1054,11 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus GraphUtils::IsolateNo GELOGE(GRAPH_PARAM_INVALID, "Failed to isolate node(null)"); return GRAPH_PARAM_INVALID; } + /// We must get full connections info before re-link data io, because the data /// edges may be unlinked when relink data io auto in_nodes_to_out = GetFullConnectIONodes(node); + InNodesToOut data_in_to_out; auto ret = RelinkDataIO(node, io_map, data_in_to_out); if (ret != GRAPH_SUCCESS) { @@ -1185,4 +1265,738 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus GraphUtils::AppendInp graph->inputs_order_.emplace_back(node->GetName()); return GRAPH_SUCCESS; } + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraphPtr GraphUtils::FindRootGraph(ComputeGraphPtr graph) { + ComputeGraphPtr result = nullptr; + while (graph != nullptr) { + result = std::move(graph); + graph = result->GetParentGraph(); + } + return result; +} + +/// +/// @brief Add node to graph +/// @param [in] op_desc +/// @return ComputeGraphBuilder +/// +ComputeGraphBuilder &ComputeGraphBuilder::AddNode(const OpDescPtr &op_desc) { + nodes_.emplace_back(op_desc); + return *this; +} + +/// +/// @brief Add data-link among nodes in graph +/// @param [in] src_name +/// @param [in] out_anchor_ind +/// @param 
[in] dst_name +/// @param [in] in_anchor_ind +/// @return ComputeGraphBuilder +/// +ComputeGraphBuilder &ComputeGraphBuilder::AddDataLink(const std::string &src_name, uint32_t out_anchor_ind, + const std::string &dst_name, uint32_t in_anchor_ind) { + data_links_.emplace_back( + std::make_pair(std::make_pair(src_name, out_anchor_ind), std::make_pair(dst_name, in_anchor_ind))); + return *this; +} + +/// +/// @brief Add ctrl-link among nodes in graph +/// @param [in] src_name +/// @param [in] dst_name +/// @return ComputeGraphBuilder +/// +ComputeGraphBuilder &ComputeGraphBuilder::AddControlLink(const std::string &src_name, const std::string &dst_name) { + ctrl_links_.emplace_back(std::make_pair(src_name, dst_name)); + return *this; +} + +/// +/// @brief Build nodes +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +void ComputeGraphBuilder::BuildNodes(graphStatus &error_code, std::string &error_msg) { + if (owner_graph_ == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "graph is NULL."; + return; + } + + std::string node_name; + for (auto &op_desc : nodes_) { + if (op_desc == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "op_desc is NULL."; + return; + } + + node_name = op_desc->GetName(); + NodePtr node = owner_graph_->AddNode(op_desc); + if (node == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "Add node " + node_name + " failed."; + return; + } + + GELOGD("Add node name:%s, type:%s.", node_name.c_str(), op_desc->GetType().c_str()); + node_names_[node_name] = node; + } + + GELOGD("BuildNodes succ."); +} + +/// +/// @brief Build data-links +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +void ComputeGraphBuilder::BuildDataLinks(graphStatus &error_code, std::string &error_msg) { + for (auto &pair : data_links_) { + std::string src_name = pair.first.first; + uint32_t out_ind = pair.first.second; + std::string dst_name = pair.second.first; + uint32_t in_ind = pair.second.second; + std::string log_msg = "Add data-edge "; + log_msg.append(src_name) + .append(":") + .append(std::to_string(out_ind)) + .append("->") + .append(dst_name) + .append(":") + .append(std::to_string(in_ind)); + + auto src_iter = node_names_.find(src_name); + auto dst_iter = node_names_.find(dst_name); + if ((src_iter == node_names_.end()) || (dst_iter == node_names_.end())) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed: node not exist in graph."; + return; + } + + NodePtr src_node = node_names_[src_name]; + NodePtr dst_node = node_names_[dst_name]; + if ((src_node == nullptr) || (dst_node == nullptr)) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed: node is NULL."; + return; + } + + if (GraphUtils::AddEdge(src_node->GetOutDataAnchor(out_ind), dst_node->GetInDataAnchor(in_ind)) != GRAPH_SUCCESS) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed."; + return; + } + + GELOGD("%s succ.", log_msg.c_str()); + } + + GELOGD("BuildDataLinks succ."); +} + +/// +/// @brief Build ctrl-links +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +void ComputeGraphBuilder::BuildCtrlLinks(graphStatus &error_code, std::string &error_msg) { + for (auto &pair : ctrl_links_) { + std::string src_name = pair.first; + std::string dst_name = pair.second; + std::string log_msg = "Add ctrl-edge "; + log_msg.append(src_name).append("->").append(dst_name); + + auto src_iter = node_names_.find(src_name); + auto dst_iter = node_names_.find(dst_name); + if ((src_iter == node_names_.end()) || 
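// ---- editor's note (illustrative sketch, not part of the patch) ----
// Each Add* method returns *this, so graph construction chains fluently;
// op_a / op_b are hypothetical OpDescPtr values:
//   builder.AddNode(op_a)
//          .AddNode(op_b)
//          .AddDataLink("A", 0, "B", 0)   // A:0 -> B:0
//          .AddControlLink("A", "B");     // A -ctrl-> B
// BuildNodes / BuildDataLinks / BuildCtrlLinks (below) then materialize these
// records against owner_graph_, reporting failures via error_code / error_msg.
// ---- end editor's note ----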
(dst_iter == node_names_.end())) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed: node not exist in graph."; + return; + } + + NodePtr src_node = node_names_[src_name]; + NodePtr dst_node = node_names_[dst_name]; + if ((src_node == nullptr) || (dst_node == nullptr)) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed: node is NULL."; + return; + } + + if (GraphUtils::AddEdge(src_node->GetOutControlAnchor(), dst_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed."; + return; + } + + GELOGD("%s succ.", log_msg.c_str()); + } + + GELOGD("BuildCtrlLinks succ."); +} + +/// @brief Get node with name +/// @param [in] name +/// @return NodePtr +/// +NodePtr ComputeGraphBuilder::GetNode(const std::string &name) { + auto iter = node_names_.find(name); + if (iter == node_names_.end()) { + GE_LOGE("node %s not exist.", name.c_str()); + return nullptr; + } + return iter->second; +} + +/// +/// @brief Add node to graph +/// @param [in] op_desc +/// @return CompleteGraphBuilder +/// +CompleteGraphBuilder &CompleteGraphBuilder::AddNode(const OpDescPtr &op_desc) { + ComputeGraphBuilder::AddNode(op_desc); + return *this; +} + +/// +/// @brief Add data-link among nodes in graph +/// @param [in] src_name +/// @param [in] out_anchor_ind +/// @param [in] dst_name +/// @param [in] in_anchor_ind +/// @return CompleteGraphBuilder +/// +CompleteGraphBuilder &CompleteGraphBuilder::AddDataLink(const std::string &src_name, uint32_t out_anchor_ind, + const std::string &dst_name, uint32_t in_anchor_ind) { + ComputeGraphBuilder::AddDataLink(src_name, out_anchor_ind, dst_name, in_anchor_ind); + return *this; +} + +/// +/// @brief Add ctrl-link among nodes in graph +/// @param [in] src_name +/// @param [in] dst_name +/// @return CompleteGraphBuilder +/// +CompleteGraphBuilder &CompleteGraphBuilder::AddControlLink(const std::string &src_name, const std::string &dst_name) { + ComputeGraphBuilder::AddControlLink(src_name, dst_name); + return *this; +} + +/// +/// @brief Set index_th input anchor for graph +/// @param [in] index +/// @param [in] node_names +/// @param [in] anchor_inds +/// @return CompleteGraphBuilder +/// +CompleteGraphBuilder &CompleteGraphBuilder::SetInput(uint32_t index, const std::vector &node_names, + const std::vector &anchor_inds) { + graph_inputs_[index] = std::make_pair(node_names, anchor_inds); + return *this; +} + +/// +/// @brief Set index_th input of graph as useless +/// @param [in] index +/// @return CompleteGraphBuilder +/// +CompleteGraphBuilder &CompleteGraphBuilder::SetUselessInput(uint32_t index) { + graph_inputs_[index] = std::make_pair(std::vector(), std::vector()); + return *this; +} + +/// +/// @brief Add output anchor for graph +/// @param [in] owner_node_name +/// @param [in] anchor_ind +/// @return CompleteGraphBuilder +/// +CompleteGraphBuilder &CompleteGraphBuilder::AddOutput(const std::string &owner_node_name, uint32_t anchor_ind) { + graph_outputs_.emplace_back(std::make_pair(owner_node_name, anchor_ind)); + return *this; +} + +/// +/// @brief Set parent-node of graph +/// @param [in] parent_node +/// @return CompleteGraphBuilder +/// +CompleteGraphBuilder &CompleteGraphBuilder::SetParentNode(const NodePtr &parent_node) { + parent_node_ = parent_node; + return *this; +} + +/// +/// @brief Set mapping-relation of parent-node in_anchor_ind & Data-node +/// @param [in] input_mapping: index_of_graph_input -> in_anchor_index_of_parent_node +/// @return CompleteGraphBuilder +/// +CompleteGraphBuilder 
&CompleteGraphBuilder::SetInputMapping(const std::map &input_mapping) { + for (auto &item : input_mapping) { + input_mapping_[item.first] = item.second; + } + return *this; +} + +/// +/// @brief Set mapping-relation of parent-node out_anchor_ind & NetOutput-node out_anchor_ind +/// @param [in] output_mapping: index_of_graph_output -> out_anchor_index_of_parent_node +/// @return CompleteGraphBuilder +/// +CompleteGraphBuilder &CompleteGraphBuilder::SetOutputMapping(const std::map &output_mapping) { + for (auto &item : output_mapping) { + output_mapping_[item.first] = item.second; + } + return *this; +} + +/// +/// @brief Build graph +/// @param [out] error_code +/// @param [out] error_msg +/// @return ComputeGraphPtr +/// +ComputeGraphPtr CompleteGraphBuilder::Build(graphStatus &error_code, std::string &error_msg) { + owner_graph_ = shared_ptr(new (std::nothrow) ComputeGraph(name_)); + if (owner_graph_ == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "graph is NULL."; + return nullptr; + } + + owner_graph_->SetParentNode(parent_node_); + + BuildNodes(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + + BuildDataLinks(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + + BuildCtrlLinks(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + + BuildInputs(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + + BuildOutputs(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + + if (AddNetOutputNode(error_code, error_msg) == nullptr) { + return nullptr; + } + + return owner_graph_; +} + +/// +/// @brief Build inputs +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +void CompleteGraphBuilder::BuildInputs(graphStatus &error_code, std::string &error_msg) { + for (auto &input : graph_inputs_) { + NodePtr data_node = AddDateNode(input.first, error_code, error_msg); + if (data_node == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "BuildInputs failed: add node Data:" + std::to_string(input.first) + +" failed."; + return; + } + + if (owner_graph_->AddInputNode(data_node) == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "BuildInputs failed: add input node Data:" + std::to_string(input.first) + +" failed."; + return; + } + + // useless input + std::vector input_names = input.second.first; + std::vector anchor_indes = input.second.second; + if (input_names.size() != anchor_indes.size()) { + error_code = GRAPH_FAILED; + error_msg = "BuildInputs failed: num of input_names and indexs not equal."; + return; + } + if (input_names.empty()) { + continue; + } + + size_t input_num = input_names.size(); + for (size_t i = 0; i < input_num; i++) { + std::string input_name = input_names[i]; + uint32_t ind = anchor_indes[i]; + auto iter = node_names_.find(input_name); + if (iter == node_names_.end()) { + error_code = GRAPH_FAILED; + error_msg = "BuildInputs failed: node " + input_name + " not exist in graph."; + return; + } + + NodePtr in_node = node_names_[input_name]; + if (in_node == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "BuildInputs failed: node " + input_name + " is NULL."; + return; + } + + if (GraphUtils::AddEdge(data_node->GetOutDataAnchor(0), in_node->GetInDataAnchor(ind)) != GRAPH_SUCCESS) { + error_code = GRAPH_FAILED; + error_msg = "BuildInputs failed: add data-edge Data:" + std::to_string(input.first) + ":0->" + input_name + + ":" + std::to_string(ind) + " failed."; + return; + } 
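// ---- editor's usage sketch (not part of the patch; a constructor taking the
// graph name is assumed from the name_ member, and op_a / op_b / parent_node
// are hypothetical) ----
CompleteGraphBuilder builder("my_subgraph");
graphStatus error_code = GRAPH_SUCCESS;
std::string error_msg;
ComputeGraphPtr graph = builder.AddNode(op_a)
                            .AddNode(op_b)
                            .AddDataLink("A", 0, "B", 0)
                            .SetInput(0, {"A"}, {0})     // graph input 0 feeds A:0
                            .AddOutput("B", 0)           // graph output 0 taken from B:0
                            .SetParentNode(parent_node)
                            .Build(error_code, error_msg);
if (graph == nullptr) {
  GELOGE(error_code, "Build subgraph failed: %s", error_msg.c_str());
}
// ---- end editor's note ----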
+ } + + GELOGD("BuildInputs : Add %u input succ.", input.first); + } + + GELOGD("BuildInputs succ."); +} + +/// +/// @brief Add data node +/// @param [in] index +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +NodePtr CompleteGraphBuilder::AddDateNode(uint32_t index, graphStatus &error_code, std::string &error_msg) { + std::string data_name = "Data_" + std::to_string(index); + OpDescBuilder op_desc_builder(data_name, "Data"); + OpDescPtr op_desc = op_desc_builder.AddInput("x").AddOutput("y").Build(); + if (op_desc == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "BuildInputs failed: create op_desc " + data_name + " failed."; + return nullptr; + } + + auto index_iter = input_mapping_.find(index); + if (index_iter != input_mapping_.end()) { + if (!ge::AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, index_iter->second)) { + error_code = GRAPH_FAILED; + error_msg = "BuildInputs failed: set attr ATTR_NAME_PARENT_NODE_INDEX for " + data_name + " failed."; + return nullptr; + } + } + + NodePtr data_node = owner_graph_->AddNode(op_desc); + if (data_node == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "BuildInputs failed: add node " + data_name + " failed."; + return nullptr; + } + + return data_node; +} + +/// +/// @brief Build outputs +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +void CompleteGraphBuilder::BuildOutputs(graphStatus &error_code, std::string &error_msg) { + std::map> out_nodes_map; + std::vector> out_nodes_info; + for (auto &pair : graph_outputs_) { + std::string output = pair.first; + int32_t ind = pair.second; + auto out_iter = node_names_.find(output); + if (out_iter == node_names_.end()) { + error_code = GRAPH_FAILED; + error_msg = "BuildOutputs failed: node " + output + " not exist in graph."; + return; + } + + NodePtr out_node = node_names_[output]; + if (out_node == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "BuildOutputs failed: node " + output + " is NULL."; + return; + } + + OutDataAnchorPtr out_anchor = out_node->GetOutDataAnchor(ind); + if (out_anchor == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "BuildOutputs failed: anchor " + output + ":" + std::to_string(ind) + " is NULL."; + return; + } + + auto iter = out_nodes_map.find(output); + if (iter == out_nodes_map.end()) { + std::vector vec = {ind}; + out_nodes_map[output] = vec; + } else { + out_nodes_map[output].emplace_back(ind); + } + out_nodes_info.emplace_back(std::make_pair(out_node, ind)); + + GELOGD("BuildOutputs : AddOutputAnchor %s:%u succ.", output.c_str(), ind); + } + + owner_graph_->SetGraphOutNodes(out_nodes_map); + owner_graph_->SetGraphOutNodesInfo(out_nodes_info); + GELOGD("BuildOutputs succ."); +} + +/// +/// @brief Add NetOutput node +/// @param [out] error_code +/// @param [out] error_msg +/// @return NodePtr +/// +NodePtr CompleteGraphBuilder::AddNetOutputNode(graphStatus &error_code, std::string &error_msg) { + std::string log_msg = "AddNetOutputNode name:" + std::string(kNodeNameNetOutput) + ", type:" + NETOUTPUT; + OpDescPtr net_output_desc = shared_ptr(new (std::nothrow) OpDesc(kNodeNameNetOutput, NETOUTPUT)); + if (net_output_desc == nullptr) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed: op_desc is NULL."; + return nullptr; + } + + std::vector> out_nodes_info = owner_graph_->GetGraphOutNodesInfo(); + error_code = BuildInOutForNetOutput(out_nodes_info, net_output_desc); + if (error_code != GRAPH_SUCCESS) { + error_msg = log_msg + " failed: add input/output tensor 
failed."; + return nullptr; + } + + NodePtr net_output_node = owner_graph_->AddNode(net_output_desc); + if (net_output_node == nullptr) { + error_code = GRAPH_FAILED; + error_msg = log_msg + " failed: add node failed."; + return nullptr; + } + + error_code = AddEdgeForNetOutput(out_nodes_info, net_output_node); + if (error_code != GRAPH_SUCCESS) { + error_msg = log_msg + " failed: link edge failed."; + return nullptr; + } + + GELOGD("%s succ.", log_msg.c_str()); + return net_output_node; +} + +/// +/// @brief Add input/output tensor for NetOutput node +/// @param [in] out_nodes_info +/// @param [out] net_output_desc +/// @return graphStatus +/// +graphStatus CompleteGraphBuilder::BuildInOutForNetOutput(const std::vector> &out_nodes_info, + OpDescPtr &net_output_desc) { + size_t output_num = out_nodes_info.size(); + for (size_t i = 0; i < output_num; i++) { + NodePtr src_node = out_nodes_info[i].first; + uint32_t src_index = out_nodes_info[i].second; + if ((src_node == nullptr) || (src_node->GetOpDesc() == nullptr)) { + GE_LOGE("AddInOutForNetOutputOp failed: src_node is NULL."); + return GRAPH_FAILED; + } + + ge::GeTensorDesc in_desc = src_node->GetOpDesc()->GetOutputDesc(src_index); + auto iter = output_mapping_.find(i); + if (iter != output_mapping_.end()) { + if (!ge::AttrUtils::SetInt(in_desc, ATTR_NAME_PARENT_NODE_INDEX, iter->second)) { + GE_LOGE("AddInOutForNetOutputOp failed: set attr ATTR_NAME_PARENT_NODE_INDEX failed."); + return GRAPH_FAILED; + } + } + + if (net_output_desc->AddInputDesc(in_desc) != SUCCESS) { + GE_LOGE("AddInOutForNetOutputOp failed: add input_desc failed."); + return GRAPH_FAILED; + } + + ge::GeTensorDesc out_desc = src_node->GetOpDesc()->GetOutputDesc(src_index); + TensorUtils::SetOutputTensor(out_desc, true); + if (net_output_desc->AddOutputDesc(out_desc) != SUCCESS) { + GE_LOGE("AddInOutForNetOutputOp failed: add output_desc failed."); + return GRAPH_FAILED; + } + } + + GELOGD("Add input/output tensor for NetOutput node succ."); + return GRAPH_SUCCESS; +} + +/// +/// @brief Add edge for NetOutput node +/// @param [in] out_nodes_info +/// @param [out] net_output_node +/// @return graphStatus +/// +graphStatus CompleteGraphBuilder::AddEdgeForNetOutput(const std::vector> &out_nodes_info, + const NodePtr &net_output_node) { + if (net_output_node == nullptr) { + GE_LOGE("AddEdgeForNetOutputOp failed: NetOutput is NULL."); + return GRAPH_FAILED; + } + + size_t out_num = out_nodes_info.size(); + for (size_t i = 0; i < out_num; i++) { + NodePtr src_node = out_nodes_info[i].first; + uint32_t ind = out_nodes_info[i].second; + if (src_node == nullptr) { + GE_LOGE("AddEdgeForNetOutputOp failed: src_node is NULL."); + return GRAPH_FAILED; + } + + if (GraphUtils::AddEdge(src_node->GetOutDataAnchor(ind), net_output_node->GetInDataAnchor(i)) != GRAPH_SUCCESS) { + GE_LOGE("Add data-edge %s:%u->%s:%zu failed.", src_node->GetName().c_str(), ind, + net_output_node->GetName().c_str(), i); + return GRAPH_FAILED; + } + } + + std::vector leaf_nodes; + for (auto &node : owner_graph_->GetDirectNode()) { + if (node->GetOutNodes().empty()) { + leaf_nodes.emplace_back(node); + } + } + for (auto &node : leaf_nodes) { + if (GraphUtils::AddEdge(node->GetOutControlAnchor(), net_output_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + GE_LOGE("Add ctrl-edge %s->%s failed.", node->GetName().c_str(), net_output_node->GetName().c_str()); + return GRAPH_FAILED; + } + } + + GELOGD("Add edge for NetOutput node succ."); + return GRAPH_SUCCESS; +} + +/// +/// @brief Add node to graph +/// @param 
[in] op_desc +/// @return PartialGraphBuilder +/// +PartialGraphBuilder &PartialGraphBuilder::AddNode(const OpDescPtr &op_desc) { + ComputeGraphBuilder::AddNode(op_desc); + return *this; +} + +/// +/// @brief Add data-link among nodes in graph +/// @param [in] src_name +/// @param [in] out_anchor_ind +/// @param [in] dst_name +/// @param [in] in_anchor_ind +/// @return PartialGraphBuilder +/// +PartialGraphBuilder &PartialGraphBuilder::AddDataLink(const std::string &src_name, uint32_t out_anchor_ind, + const std::string &dst_name, uint32_t in_anchor_ind) { + ComputeGraphBuilder::AddDataLink(src_name, out_anchor_ind, dst_name, in_anchor_ind); + return *this; +} + +/// +/// @brief Add ctrl-link among nodes in graph +/// @param [in] src_name +/// @param [in] dst_name +/// @return PartialGraphBuilder +/// +PartialGraphBuilder &PartialGraphBuilder::AddControlLink(const std::string &src_name, const std::string &dst_name) { + ComputeGraphBuilder::AddControlLink(src_name, dst_name); + return *this; +} + +/// +/// @brief Set owner graph +/// @param [in] graph +/// @return PartialGraphBuilder +/// +PartialGraphBuilder &PartialGraphBuilder::SetOwnerGraph(const ComputeGraphPtr &graph) { + owner_graph_ = graph; + return *this; +} + +/// +/// @brief Add exist node +/// @param [in] node +/// @return PartialGraphBuilder +/// +PartialGraphBuilder &PartialGraphBuilder::AddExistNode(const NodePtr &node) { + exist_nodes_.emplace_back(node); + return *this; +} + +/// +/// @brief Build partial graph +/// @param [out] error_code +/// @param [out] error_msg +/// @return ComputeGraphPtr +/// +ComputeGraphPtr PartialGraphBuilder::Build(graphStatus &error_code, std::string &error_msg) { + if (owner_graph_ == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "graph is NULL."; + return nullptr; + } + + BuildNodes(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + + BuildExistNodes(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + + BuildDataLinks(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + + BuildCtrlLinks(error_code, error_msg); + if (error_code != GRAPH_SUCCESS) { + return nullptr; + } + + return owner_graph_; +} + +/// +/// @brief Build exist nodes +/// @param [out] error_code +/// @param [out] error_msg +/// @return void +/// +void PartialGraphBuilder::BuildExistNodes(graphStatus &error_code, std::string &error_msg) { + std::string node_name; + for (auto &node : exist_nodes_) { + if (node == nullptr) { + error_code = GRAPH_FAILED; + error_msg = "Build exist nodes failed: node is NULL."; + return; + } + + node_name = node->GetName(); + if (node->GetOwnerComputeGraph() != owner_graph_) { + error_code = GRAPH_FAILED; + error_msg = "Build exist nodes failed: node " + node_name + " not belongs to this graph."; + return; + } + + GELOGD("Add exist_node name:%s.", node_name.c_str()); + node_names_[node_name] = node; + } + + GELOGD("Build exist nodes succ."); +} } // namespace ge diff --git a/src/common/graph/utils/node_utils.cc b/src/common/graph/utils/node_utils.cc index ae87435c..52d81e43 100644 --- a/src/common/graph/utils/node_utils.cc +++ b/src/common/graph/utils/node_utils.cc @@ -15,10 +15,12 @@ */ #include "utils/node_utils.h" +#include "graph/utils/graph_utils.h" #include "debug/ge_op_types.h" #include "debug/ge_util.h" #include "framework/common/debug/ge_log.h" #include "graph/anchor.h" +#include "graph/debug/ge_attr_define.h" #include "utils/tensor_utils.h" #include "utils/type_utils.h" @@ -109,6 
+111,7 @@ graphStatus NodeUtils::GetDataOutAnchorAndControlInAnchor(const NodePtr &node_pt graphStatus NodeUtils::ClearInDataAnchor(const NodePtr &node_ptr, const InDataAnchorPtr &in_data_anchor) { GE_CHK_BOOL_EXEC(node_ptr != nullptr && in_data_anchor != nullptr, return GRAPH_FAILED, "node or in_data_anchor is nullptr"); + bool find_flag = false; uint32_t index = 0; vector::iterator it = node_ptr->in_data_anchors_.end(); @@ -358,4 +361,45 @@ graphStatus NodeUtils::UpdateInputShape(const Node &node, uint32_t index, const input_desc->SetShape(shape); return GRAPH_SUCCESS; } +std::string NodeUtils::GetNodeType(const Node &node) { + if (node.GetType() != FRAMEWORKOP) { + return node.GetType(); + } + std::string type; + (void)AttrUtils::GetStr(node.GetOpDesc(), ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, type); + return type; +} +ComputeGraphPtr NodeUtils::GetSubgraph(const Node &node, uint32_t index) { + auto op_desc = node.GetOpDesc(); + if (op_desc == nullptr) { + return nullptr; + } + auto root_graph = GraphUtils::FindRootGraph(node.GetOwnerComputeGraph()); + if (root_graph == nullptr) { + return nullptr; + } + return root_graph->GetSubgraph(op_desc->GetSubgraphInstanceName(index)); +} + +graphStatus NodeUtils::AddSubgraph(Node &node, const ComputeGraphPtr &subgraph) { + if (subgraph == nullptr) { + GE_LOGE("Failed to add subgraph to node %s, null subgraph", node.GetName().c_str()); + return GRAPH_PARAM_INVALID; + } + auto op_desc = node.GetOpDesc(); + if (op_desc == nullptr) { + return GRAPH_PARAM_INVALID; + } + auto root_graph = GraphUtils::FindRootGraph(node.GetOwnerComputeGraph()); + if (root_graph == nullptr) { + GE_LOGE("Failed to add subgraph to node %s, null root graph", node.GetName().c_str()); + return GRAPH_PARAM_INVALID; + } + op_desc->AddSubgraphInstanceName(subgraph->GetName()); + subgraph->SetParentNode(node.shared_from_this()); + subgraph->SetParentGraph(node.GetOwnerComputeGraph()); + root_graph->AddSubgraph(subgraph); + + return GRAPH_SUCCESS; +} } // namespace ge diff --git a/src/common/graph/utils/op_desc_utils.cc b/src/common/graph/utils/op_desc_utils.cc index f2214350..89175b56 100644 --- a/src/common/graph/utils/op_desc_utils.cc +++ b/src/common/graph/utils/op_desc_utils.cc @@ -15,9 +15,7 @@ */ #include "utils/op_desc_utils.h" - #include - #include "debug/ge_attr_define.h" #include "debug/ge_op_types.h" #include "debug/ge_util.h" @@ -209,6 +207,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector OpDescUtils:: GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector OpDescUtils::GetInputData( const vector &input_nodes) { vector ret; + for (const auto &input_node : input_nodes) { auto temp_weight = MutableWeights(input_node->GetOpDesc()); if (temp_weight == nullptr) { @@ -379,7 +378,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector OpDescUt if (NodeUtils::IsAnchorStatusSet(*node)) { for (const auto &in_anchor : node->GetAllInDataAnchors()) { if (ge::AnchorUtils::GetStatus(in_anchor) == ANCHOR_DATA) { - (void)ret.push_back(node->GetOpDesc()->GetInputDesc(in_anchor->GetIdx())); + ret.push_back(node->GetOpDesc()->GetInputDesc(in_anchor->GetIdx())); } } } else { @@ -389,7 +388,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector OpDescUt continue; } if (out_anchor->GetOwnerNode()->GetOpDesc()->GetType() != CONSTANT) { - (void)ret.push_back(node->GetOpDesc()->GetInputDesc(in_anchor->GetIdx())); + ret.push_back(node->GetOpDesc()->GetInputDesc(in_anchor->GetIdx())); } } } @@ -572,4 +571,80 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus 
OpDescUtils::ClearWei } return GRAPH_SUCCESS; } + +/// +/// @brief Add input +/// @param [in] name +/// @return OpDescBuilder +/// +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescBuilder &OpDescBuilder::AddInput(const std::string &name) { + inputs_.emplace_back(name); + return *this; +} + +/// +/// @brief Add dynamic input +/// @param [in] name +/// @param [in] num +/// @return OpDescBuilder +/// +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescBuilder &OpDescBuilder::AddDynamicInput(const std::string &name, + uint32_t num) { + for (uint32_t i = 0; i < num; i++) { + inputs_.emplace_back(name + std::to_string(i)); + } + return *this; +} + +/// +/// @brief Add output +/// @param [in] name +/// @return OpDescBuilder +/// +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescBuilder &OpDescBuilder::AddOutput(const std::string &name) { + outputs_.emplace_back(name); + return *this; +} + +/// +/// @brief Add dynamic output +/// @param [in] name +/// @param [in] num +/// @return OpDescBuilder +/// +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescBuilder &OpDescBuilder::AddDynamicOutput(const std::string &name, + uint32_t num) { + for (uint32_t i = 0; i < num; i++) { + outputs_.emplace_back(name + std::to_string(i)); + } + return *this; +} + +/// +/// @brief Build op_desc +/// @return OpDescPtr +/// +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY OpDescPtr OpDescBuilder::Build() { + OpDescPtr op_desc = shared_ptr(new (std::nothrow) OpDesc(name_, type_)); + if (op_desc == nullptr) { + GELOGE(GRAPH_FAILED, "OpDesc is nullptr"); + return nullptr; + } + + for (auto &input : inputs_) { + if (op_desc->AddInputDesc(input, GeTensorDesc()) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Add input_desc failed."); + return nullptr; + } + } + + for (auto &output : outputs_) { + if (op_desc->AddOutputDesc(output, GeTensorDesc()) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Add output_desc failed."); + return nullptr; + } + } + + return op_desc; +} } // namespace ge diff --git a/src/common/graph/utils/tensor_utils.cc b/src/common/graph/utils/tensor_utils.cc index 819f5d58..072673c0 100644 --- a/src/common/graph/utils/tensor_utils.cc +++ b/src/common/graph/utils/tensor_utils.cc @@ -15,7 +15,6 @@ */ #include "graph/utils/tensor_utils.h" - #include #include "debug/ge_log.h" @@ -276,6 +275,14 @@ static graphStatus CalcTensorElementCnt(const std::vector &dims, Format break; case FORMAT_FRACTAL_NZ: case FORMAT_FRACTAL_ZZ: + case FORMAT_NDHWC: + case FORMAT_NCDHW: + case FORMAT_DHWCN: + case FORMAT_DHWNC: + case FORMAT_FRACTAL_Z_3D: + case FORMAT_FRACTAL_Z_3D_TRANSPOSE: + case FORMAT_NDC1HWC0: + case FORMAT_FRACTAL_Z_C04: graph_status = CalcElementCntByDims(dims, element_cnt); break; default: @@ -351,21 +358,21 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus TensorUtils::CalcTens } GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus -TensorUtils::GetTensorMemorySizeInBytes(const GeTensorDesc &desc_temp, uint32_t &size_temp) { +TensorUtils::GetTensorMemorySizeInBytes(const GeTensorDesc &desc_temp, int64_t &size_temp) { graphStatus graph_status = GetTensorSizeInBytes(desc_temp, size_temp); if (graph_status != GRAPH_SUCCESS) { return GRAPH_FAILED; } // 64-byte alignment, if size is 0, align to 32 bytes - if (size_temp > (UINT32_MAX - kNum2 * kDataMemAlignSize)) { - GELOGW("The updated mem size %u is bigger than UINT32_MAX", size_temp); + if (size_temp > (INT64_MAX - kNum2 * kDataMemAlignSize)) { + GELOGW("The updated mem size %ld is bigger than INT64_MAX", size_temp); } else { size_temp = 
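// ---- editor's usage sketch (illustrative names, not part of the patch) ----
// OpDescBuilder assembles an op_desc with placeholder GeTensorDesc entries,
// matching how AddDateNode uses it earlier in this patch;
// AddDynamicInput("x", 3) would expand to inputs x0, x1, x2.
OpDescPtr op_desc = OpDescBuilder("my_add", "Add")
                        .AddInput("x1")
                        .AddInput("x2")
                        .AddOutput("y")
                        .Build();  // returns nullptr on failure, with an error logged
// ---- end editor's note ----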
((size_temp + kNum2 * kDataMemAlignSize - 1) / kDataMemAlignSize) * kDataMemAlignSize; } return GRAPH_SUCCESS; } GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus -TensorUtils::GetTensorSizeInBytes(const GeTensorDesc &desc_temp, uint32_t &size_temp) { +TensorUtils::GetTensorSizeInBytes(const GeTensorDesc &desc_temp, int64_t &size_temp) { GeShape output_shape = desc_temp.GetShape(); Format format = desc_temp.GetFormat(); DataType data_type = desc_temp.GetDataType(); @@ -376,13 +383,13 @@ TensorUtils::GetTensorSizeInBytes(const GeTensorDesc &desc_temp, uint32_t &size_ return GRAPH_FAILED; } - if ((output_mem_size > UINT32_MAX) || (output_mem_size < 0)) { - GELOGE(GRAPH_FAILED, "After calc concat tensor memory size, output_mem_size = %ld, out of data range [0, %u]", - output_mem_size, UINT32_MAX); + if (output_mem_size < 0) { + GELOGE(GRAPH_FAILED, "After calc concat tensor memory size, output_mem_size = %ld, out of data range [0, %ld]", + output_mem_size, INT64_MAX); return GRAPH_FAILED; } - size_temp = static_cast(output_mem_size); + size_temp = output_mem_size; return GRAPH_SUCCESS; } } // namespace ge diff --git a/src/common/graph/utils/type_utils.cc b/src/common/graph/utils/type_utils.cc index 61b57d80..cd316260 100644 --- a/src/common/graph/utils/type_utils.cc +++ b/src/common/graph/utils/type_utils.cc @@ -19,43 +19,45 @@ namespace ge { static const std::map kFormatToStringMap = { - {FORMAT_NCHW, "NCHW"}, - {FORMAT_NHWC, "NHWC"}, - {FORMAT_ND, "ND"}, - {FORMAT_NC1HWC0, "NC1HWC0"}, - {FORMAT_FRACTAL_Z, "FRACTAL_Z"}, - {FORMAT_NC1C0HWPAD, "NC1C0HWPAD"}, - {FORMAT_NHWC1C0, "NHWC1C0"}, - {FORMAT_FSR_NCHW, "FSR_NCHW"}, - {FORMAT_FRACTAL_DECONV, "FRACTAL_DECONV"}, - {FORMAT_C1HWNC0, "C1HWNC0"}, - {FORMAT_FRACTAL_DECONV_TRANSPOSE, "FRACTAL_DECONV_TRANSPOSE"}, - {FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS, "FRACTAL_DECONV_SP_STRIDE_TRANS"}, - {FORMAT_NC1HWC0_C04, "NC1HWC0_C04"}, - {FORMAT_FRACTAL_Z_C04, "FRACTAL_Z_C04"}, - {FORMAT_CHWN, "CHWN"}, - {FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS, "DECONV_SP_STRIDE8_TRANS"}, - {FORMAT_NC1KHKWHWC0, "NC1KHKWHWC0"}, - {FORMAT_BN_WEIGHT, "BN_WEIGHT"}, - {FORMAT_FILTER_HWCK, "FILTER_HWCK"}, - {FORMAT_HWCN, "HWCN"}, - {FORMAT_HASHTABLE_LOOKUP_LOOKUPS, "LOOKUP_LOOKUPS"}, - {FORMAT_HASHTABLE_LOOKUP_KEYS, "LOOKUP_KEYS"}, - {FORMAT_HASHTABLE_LOOKUP_VALUE, "LOOKUP_VALUE"}, - {FORMAT_HASHTABLE_LOOKUP_OUTPUT, "LOOKUP_OUTPUT"}, - {FORMAT_HASHTABLE_LOOKUP_HITS, "LOOKUP_HITS"}, - {FORMAT_MD, "MD"}, - {FORMAT_NDHWC, "NDHWC"}, - {FORMAT_NCDHW, "NCDHW"}, - {FORMAT_DHWCK, "DHWCK"}, - {FORMAT_NDC1HWC0, "NDC1HWC0"}, - {FORMAT_FRACTAL_Z_3D, "FRACTAL_Z_3D"}, - {FORMAT_C1HWNCoC0, "C1HWNCoC0"}, - {FORMAT_FRACTAL_NZ, "FRACTAL_NZ"}, - {FORMAT_CN, "CN"}, - {FORMAT_NC, "NC"}, - {FORMAT_RESERVED, "FORMAT_RESERVED"}, - {FORMAT_ALL, "ALL"}}; + {FORMAT_NCHW, "NCHW"}, + {FORMAT_NHWC, "NHWC"}, + {FORMAT_ND, "ND"}, + {FORMAT_NC1HWC0, "NC1HWC0"}, + {FORMAT_FRACTAL_Z, "FRACTAL_Z"}, + {FORMAT_NC1C0HWPAD, "NC1C0HWPAD"}, + {FORMAT_NHWC1C0, "NHWC1C0"}, + {FORMAT_FSR_NCHW, "FSR_NCHW"}, + {FORMAT_FRACTAL_DECONV, "FRACTAL_DECONV"}, + {FORMAT_C1HWNC0, "C1HWNC0"}, + {FORMAT_FRACTAL_DECONV_TRANSPOSE, "FRACTAL_DECONV_TRANSPOSE"}, + {FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS, "FRACTAL_DECONV_SP_STRIDE_TRANS"}, + {FORMAT_NC1HWC0_C04, "NC1HWC0_C04"}, + {FORMAT_FRACTAL_Z_C04, "FRACTAL_Z_C04"}, + {FORMAT_CHWN, "CHWN"}, + {FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS, "DECONV_SP_STRIDE8_TRANS"}, + {FORMAT_NC1KHKWHWC0, "NC1KHKWHWC0"}, + {FORMAT_BN_WEIGHT, "BN_WEIGHT"}, + {FORMAT_FILTER_HWCK, "FILTER_HWCK"}, + 
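The alignment expression above is easy to misread. Assuming kDataMemAlignSize = 32 and kNum2 = 2 (implied by the "64-byte alignment" comment, but the constants are defined outside this hunk), it rounds the size up to a multiple of 32 and always reserves one extra 32-byte block; the helper name below is illustrative:

#include <cstdint>

int64_t AlignTensorMemSize(int64_t size) {
  const int64_t kAlign = 32;  // assumed kDataMemAlignSize
  const int64_t kNum2 = 2;
  return ((size + kNum2 * kAlign - 1) / kAlign) * kAlign;
}

// AlignTensorMemSize(0)  == 32   (an empty tensor still reserves one block)
// AlignTensorMemSize(1)  == 64
// AlignTensorMemSize(32) == 64
// AlignTensorMemSize(33) == 96   i.e. ceil(size/32)*32 plus one extra block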
{FORMAT_HWCN, "HWCN"}, + {FORMAT_HASHTABLE_LOOKUP_LOOKUPS, "LOOKUP_LOOKUPS"}, + {FORMAT_HASHTABLE_LOOKUP_KEYS, "LOOKUP_KEYS"}, + {FORMAT_HASHTABLE_LOOKUP_VALUE, "LOOKUP_VALUE"}, + {FORMAT_HASHTABLE_LOOKUP_OUTPUT, "LOOKUP_OUTPUT"}, + {FORMAT_HASHTABLE_LOOKUP_HITS, "LOOKUP_HITS"}, + {FORMAT_MD, "MD"}, + {FORMAT_NDHWC, "NDHWC"}, + {FORMAT_NCDHW, "NCDHW"}, + {FORMAT_DHWCN, "DHWCN"}, + {FORMAT_DHWNC, "DHWNC"}, + {FORMAT_NDC1HWC0, "NDC1HWC0"}, + {FORMAT_FRACTAL_Z_3D, "FRACTAL_Z_3D"}, + {FORMAT_FRACTAL_Z_3D_TRANSPOSE, "FRACTAL_Z_3D_TRANSPOSE"}, + {FORMAT_C1HWNCoC0, "C1HWNCoC0"}, + {FORMAT_FRACTAL_NZ, "FRACTAL_NZ"}, + {FORMAT_CN, "CN"}, + {FORMAT_NC, "NC"}, + {FORMAT_RESERVED, "FORMAT_RESERVED"}, + {FORMAT_ALL, "ALL"}}; static const std::unordered_set kInternalFormat = {"NC1HWC0", "FRACTAL_Z", @@ -73,137 +75,140 @@ static const std::unordered_set kInternalFormat = {"NC1HWC0", "FRACTAL_ZZ", "FRACTAL_NZ", "NDC1HWC0", - "FORMAT_FRACTAL_Z_3D"}; + "FORMAT_FRACTAL_Z_3D", + "FORMAT_FRACTAL_Z_3D_TRANSPOSE"}; static const std::map kDataFormatMap = { - {"NCHW", FORMAT_NCHW}, {"NHWC", FORMAT_NHWC}, {"ND", FORMAT_ND}}; + {"NCHW", FORMAT_NCHW}, {"NHWC", FORMAT_NHWC}, {"NDHWC", FORMAT_NDHWC}, {"NCDHW", FORMAT_NCDHW}, {"ND", FORMAT_ND}}; static const std::map kStringToFormatMap = { - {"NCHW", FORMAT_NCHW}, - {"NHWC", FORMAT_NHWC}, - {"ND", FORMAT_ND}, - {"NC1HWC0", FORMAT_NC1HWC0}, - {"FRACTAL_Z", FORMAT_FRACTAL_Z}, - {"NC1C0HWPAD", FORMAT_NC1C0HWPAD}, - {"NHWC1C0", FORMAT_NHWC1C0}, - {"FSR_NCHW", FORMAT_FSR_NCHW}, - {"FRACTAL_DECONV", FORMAT_FRACTAL_DECONV}, - {"C1HWNC0", FORMAT_C1HWNC0}, - {"FRACTAL_DECONV_TRANSPOSE", FORMAT_FRACTAL_DECONV_TRANSPOSE}, - {"FRACTAL_DECONV_SP_STRIDE_TRANS", FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS}, - {"NC1HWC0_C04", FORMAT_NC1HWC0_C04}, - {"FRACTAL_Z_C04", FORMAT_FRACTAL_Z_C04}, - {"CHWN", FORMAT_CHWN}, - {"DECONV_SP_STRIDE8_TRANS", FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS}, - {"NC1KHKWHWC0", FORMAT_NC1KHKWHWC0}, - {"BN_WEIGHT", FORMAT_BN_WEIGHT}, - {"FILTER_HWCK", FORMAT_FILTER_HWCK}, - {"HWCN", FORMAT_HWCN}, - {"LOOKUP_LOOKUPS", FORMAT_HASHTABLE_LOOKUP_LOOKUPS}, - {"LOOKUP_KEYS", FORMAT_HASHTABLE_LOOKUP_KEYS}, - {"LOOKUP_VALUE", FORMAT_HASHTABLE_LOOKUP_VALUE}, - {"LOOKUP_OUTPUT", FORMAT_HASHTABLE_LOOKUP_OUTPUT}, - {"LOOKUP_HITS", FORMAT_HASHTABLE_LOOKUP_HITS}, - {"MD", FORMAT_MD}, - {"C1HWNCoC0", FORMAT_C1HWNCoC0}, - {"FRACTAL_NZ", FORMAT_FRACTAL_NZ}, - {"NDHWC", FORMAT_NDHWC}, - {"NCDHW", FORMAT_NCDHW}, - {"DHWCK", FORMAT_DHWCK}, - {"NDC1HWC0", FORMAT_NDC1HWC0}, - {"FRACTAL_Z_3D", FORMAT_FRACTAL_Z_3D}, - {"CN", FORMAT_CN}, - {"NC", FORMAT_NC}, - {"FORMAT_RESERVED", FORMAT_RESERVED}, - {"ALL", FORMAT_ALL}}; + {"NCHW", FORMAT_NCHW}, + {"NHWC", FORMAT_NHWC}, + {"ND", FORMAT_ND}, + {"NC1HWC0", FORMAT_NC1HWC0}, + {"FRACTAL_Z", FORMAT_FRACTAL_Z}, + {"NC1C0HWPAD", FORMAT_NC1C0HWPAD}, + {"NHWC1C0", FORMAT_NHWC1C0}, + {"FSR_NCHW", FORMAT_FSR_NCHW}, + {"FRACTAL_DECONV", FORMAT_FRACTAL_DECONV}, + {"C1HWNC0", FORMAT_C1HWNC0}, + {"FRACTAL_DECONV_TRANSPOSE", FORMAT_FRACTAL_DECONV_TRANSPOSE}, + {"FRACTAL_DECONV_SP_STRIDE_TRANS", FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS}, + {"NC1HWC0_C04", FORMAT_NC1HWC0_C04}, + {"FRACTAL_Z_C04", FORMAT_FRACTAL_Z_C04}, + {"CHWN", FORMAT_CHWN}, + {"DECONV_SP_STRIDE8_TRANS", FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS}, + {"NC1KHKWHWC0", FORMAT_NC1KHKWHWC0}, + {"BN_WEIGHT", FORMAT_BN_WEIGHT}, + {"FILTER_HWCK", FORMAT_FILTER_HWCK}, + {"HWCN", FORMAT_HWCN}, + {"LOOKUP_LOOKUPS", FORMAT_HASHTABLE_LOOKUP_LOOKUPS}, + {"LOOKUP_KEYS", FORMAT_HASHTABLE_LOOKUP_KEYS}, + 
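kInternalFormat above holds serialized format names used to distinguish device-private layouts from user-facing ones. A sketch of the membership test it typically backs (helper name illustrative). One hedged observation: the two new 3D entries keep a "FORMAT_" prefix while the neighbouring entries and kFormatToStringMap use bare names, so a lookup of "FRACTAL_Z_3D" would presumably miss them; worth confirming against the callers.

#include <string>
#include <unordered_set>

// Illustrative helper over the kInternalFormat table defined above.
bool IsInternalFormat(const std::string &serialized_name) {
  return kInternalFormat.count(serialized_name) > 0;
}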
{"LOOKUP_VALUE", FORMAT_HASHTABLE_LOOKUP_VALUE}, + {"LOOKUP_OUTPUT", FORMAT_HASHTABLE_LOOKUP_OUTPUT}, + {"LOOKUP_HITS", FORMAT_HASHTABLE_LOOKUP_HITS}, + {"MD", FORMAT_MD}, + {"C1HWNCoC0", FORMAT_C1HWNCoC0}, + {"FRACTAL_NZ", FORMAT_FRACTAL_NZ}, + {"NDHWC", FORMAT_NDHWC}, + {"NCDHW", FORMAT_NCDHW}, + {"DHWCN", FORMAT_DHWCN}, + {"DHWNC", FORMAT_DHWNC}, + {"NDC1HWC0", FORMAT_NDC1HWC0}, + {"FRACTAL_Z_3D", FORMAT_FRACTAL_Z_3D}, + {"FRACTAL_Z_3D_TRANSPOSE", FORMAT_FRACTAL_Z_3D_TRANSPOSE}, + {"CN", FORMAT_CN}, + {"NC", FORMAT_NC}, + {"FORMAT_RESERVED", FORMAT_RESERVED}, + {"ALL", FORMAT_ALL}}; static const std::map kDataTypeToStringMap = { - {DT_UNDEFINED, "DT_UNDEFINED"}, // Used to indicate a DataType field has not been set. - {DT_FLOAT, "DT_FLOAT"}, // float type - {DT_FLOAT16, "DT_FLOAT16"}, // fp16 type - {DT_INT8, "DT_INT8"}, // int8 type - {DT_INT16, "DT_INT16"}, // int16 type - {DT_UINT16, "DT_UINT16"}, // uint16 type - {DT_UINT8, "DT_UINT8"}, // uint8 type - {DT_INT32, "DT_INT32"}, // uint32 type - {DT_INT64, "DT_INT64"}, // int64 type - {DT_UINT32, "DT_UINT32"}, // unsigned int32 - {DT_UINT64, "DT_UINT64"}, // unsigned int64 - {DT_BOOL, "DT_BOOL"}, // bool type - {DT_DOUBLE, "DT_DOUBLE"}, // double type - {DT_DUAL, "DT_DUAL"}, // dual output type - {DT_DUAL_SUB_INT8, "DT_DUAL_SUB_INT8"}, // dual output int8 type - {DT_DUAL_SUB_UINT8, "DT_DUAL_SUB_UINT8"}, // dual output uint8 type - {DT_COMPLEX64, "DT_COMPLEX64"}, // complex64 type - {DT_COMPLEX128, "DT_COMPLEX128"}, // complex128 type - {DT_QINT8, "DT_QINT8"}, // qint8 type - {DT_QINT16, "DT_QINT16"}, // qint16 type - {DT_QINT32, "DT_QINT32"}, // qint32 type - {DT_QUINT8, "DT_QUINT8"}, // quint8 type - {DT_QUINT16, "DT_QUINT16"}, // quint16 type - {DT_RESOURCE, "DT_RESOURCE"}, // resource type - {DT_STRING_REF, "DT_STRING_REF"}, // string ref type - {DT_STRING, "DT_STRING"}, // string type + {DT_UNDEFINED, "DT_UNDEFINED"}, // Used to indicate a DataType field has not been set. + {DT_FLOAT, "DT_FLOAT"}, // float type + {DT_FLOAT16, "DT_FLOAT16"}, // fp16 type + {DT_INT8, "DT_INT8"}, // int8 type + {DT_INT16, "DT_INT16"}, // int16 type + {DT_UINT16, "DT_UINT16"}, // uint16 type + {DT_UINT8, "DT_UINT8"}, // uint8 type + {DT_INT32, "DT_INT32"}, // uint32 type + {DT_INT64, "DT_INT64"}, // int64 type + {DT_UINT32, "DT_UINT32"}, // unsigned int32 + {DT_UINT64, "DT_UINT64"}, // unsigned int64 + {DT_BOOL, "DT_BOOL"}, // bool type + {DT_DOUBLE, "DT_DOUBLE"}, // double type + {DT_DUAL, "DT_DUAL"}, // dual output type + {DT_DUAL_SUB_INT8, "DT_DUAL_SUB_INT8"}, // dual output int8 type + {DT_DUAL_SUB_UINT8, "DT_DUAL_SUB_UINT8"}, // dual output uint8 type + {DT_COMPLEX64, "DT_COMPLEX64"}, // complex64 type + {DT_COMPLEX128, "DT_COMPLEX128"}, // complex128 type + {DT_QINT8, "DT_QINT8"}, // qint8 type + {DT_QINT16, "DT_QINT16"}, // qint16 type + {DT_QINT32, "DT_QINT32"}, // qint32 type + {DT_QUINT8, "DT_QUINT8"}, // quint8 type + {DT_QUINT16, "DT_QUINT16"}, // quint16 type + {DT_RESOURCE, "DT_RESOURCE"}, // resource type + {DT_STRING_REF, "DT_STRING_REF"}, // string ref type + {DT_STRING, "DT_STRING"}, // string type }; static const std::map kStringTodataTypeMap = { - {"DT_UNDEFINED", DT_UNDEFINED}, // Used to indicate a DataType field has not been set. 
- {"DT_FLOAT", DT_FLOAT}, // float type - { - "DT_FLOAT16", - DT_FLOAT16, - }, // fp16 type - {"DT_INT8", DT_INT8}, // int8 type - {"DT_INT16", DT_INT16}, // int16 type - {"DT_UINT16", DT_UINT16}, // uint16 type - {"DT_UINT8", DT_UINT8}, // uint8 type - {"DT_INT32", DT_INT32}, // uint32 type - {"DT_INT64", DT_INT64}, // int64 type - {"DT_UINT32", DT_UINT32}, // unsigned int32 - {"DT_UINT64", DT_UINT64}, // unsigned int64 - {"DT_BOOL", DT_BOOL}, // bool type - {"DT_DOUBLE", DT_DOUBLE}, // double type - {"DT_DUAL", DT_DUAL}, // dual output type - {"DT_DUAL_SUB_INT8", DT_DUAL_SUB_INT8}, // dual output int8 type - {"DT_DUAL_SUB_UINT8", DT_DUAL_SUB_UINT8}, // dual output uint8 type - {"DT_COMPLEX64", DT_COMPLEX64}, // complex64 type - {"DT_COMPLEX128", DT_COMPLEX128}, // complex128 type - {"DT_QINT8", DT_QINT8}, // qint8 type - {"DT_QINT16", DT_QINT16}, // qint16 type - {"DT_QINT32", DT_QINT32}, // qint32 type - {"DT_QUINT8", DT_QUINT8}, // quint8 type - {"DT_QUINT16", DT_QUINT16}, // quint16 type - {"DT_RESOURCE", DT_RESOURCE}, // resource type - {"DT_STRING_REF", DT_STRING_REF}, // string ref type - {"DT_STRING", DT_STRING}, // string type + {"DT_UNDEFINED", DT_UNDEFINED}, // Used to indicate a DataType field has not been set. + {"DT_FLOAT", DT_FLOAT}, // float type + { + "DT_FLOAT16", + DT_FLOAT16, + }, // fp16 type + {"DT_INT8", DT_INT8}, // int8 type + {"DT_INT16", DT_INT16}, // int16 type + {"DT_UINT16", DT_UINT16}, // uint16 type + {"DT_UINT8", DT_UINT8}, // uint8 type + {"DT_INT32", DT_INT32}, // uint32 type + {"DT_INT64", DT_INT64}, // int64 type + {"DT_UINT32", DT_UINT32}, // unsigned int32 + {"DT_UINT64", DT_UINT64}, // unsigned int64 + {"DT_BOOL", DT_BOOL}, // bool type + {"DT_DOUBLE", DT_DOUBLE}, // double type + {"DT_DUAL", DT_DUAL}, // dual output type + {"DT_DUAL_SUB_INT8", DT_DUAL_SUB_INT8}, // dual output int8 type + {"DT_DUAL_SUB_UINT8", DT_DUAL_SUB_UINT8}, // dual output uint8 type + {"DT_COMPLEX64", DT_COMPLEX64}, // complex64 type + {"DT_COMPLEX128", DT_COMPLEX128}, // complex128 type + {"DT_QINT8", DT_QINT8}, // qint8 type + {"DT_QINT16", DT_QINT16}, // qint16 type + {"DT_QINT32", DT_QINT32}, // qint32 type + {"DT_QUINT8", DT_QUINT8}, // quint8 type + {"DT_QUINT16", DT_QUINT16}, // quint16 type + {"DT_RESOURCE", DT_RESOURCE}, // resource type + {"DT_STRING_REF", DT_STRING_REF}, // string ref type + {"DT_STRING", DT_STRING}, // string type }; static const std::map kDataTypeToLength = { - {DT_BOOL, sizeof(bool)}, - {DT_INT64, sizeof(int64_t)}, - {DT_UINT64, sizeof(int64_t)}, - {DT_FLOAT, sizeof(float)}, - {DT_INT32, sizeof(int32_t)}, - {DT_UINT32, sizeof(int32_t)}, - {DT_INT8, sizeof(char)}, - {DT_UINT8, sizeof(char)}, - {DT_INT16, sizeof(int16_t)}, - {DT_UINT16, sizeof(int16_t)}, - {DT_FLOAT16, sizeof(int16_t)}, - {DT_DOUBLE, sizeof(double)}, - {DT_DUAL, sizeof(float) + sizeof(int8_t)}, - {DT_DUAL_SUB_INT8, sizeof(int8_t)}, - {DT_DUAL_SUB_UINT8, sizeof(uint8_t)}, - {DT_COMPLEX64, sizeof(int64_t)}, - {DT_COMPLEX128, sizeof(int64_t) * 2}, - {DT_QINT8, sizeof(int8_t)}, - {DT_QINT16, sizeof(int16_t)}, - {DT_QINT32, sizeof(int32_t)}, - {DT_QUINT8, sizeof(uint8_t)}, - {DT_QUINT16, sizeof(uint16_t)}, - {DT_STRING_REF, sizeof(uint64_t) * 2}, - {DT_STRING, sizeof(uint64_t)}, - {DT_RESOURCE, sizeof(uint64_t)}, + {DT_BOOL, sizeof(bool)}, + {DT_INT64, sizeof(int64_t)}, + {DT_UINT64, sizeof(int64_t)}, + {DT_FLOAT, sizeof(float)}, + {DT_INT32, sizeof(int32_t)}, + {DT_UINT32, sizeof(int32_t)}, + {DT_INT8, sizeof(char)}, + {DT_UINT8, sizeof(char)}, + {DT_INT16, sizeof(int16_t)}, + 
{DT_UINT16, sizeof(int16_t)}, + {DT_FLOAT16, sizeof(int16_t)}, + {DT_DOUBLE, sizeof(double)}, + {DT_DUAL, sizeof(float) + sizeof(int8_t)}, + {DT_DUAL_SUB_INT8, sizeof(int8_t)}, + {DT_DUAL_SUB_UINT8, sizeof(uint8_t)}, + {DT_COMPLEX64, sizeof(int64_t)}, + {DT_COMPLEX128, sizeof(int64_t) * 2}, + {DT_QINT8, sizeof(int8_t)}, + {DT_QINT16, sizeof(int16_t)}, + {DT_QINT32, sizeof(int32_t)}, + {DT_QUINT8, sizeof(uint8_t)}, + {DT_QUINT16, sizeof(uint16_t)}, + {DT_STRING_REF, sizeof(uint64_t) * 2}, + {DT_STRING, sizeof(uint64_t)}, + {DT_RESOURCE, sizeof(uint64_t)}, }; bool TypeUtils::IsDataTypeValid(DataType dt) { diff --git a/src/ge/CMakeLists.txt b/src/ge/CMakeLists.txt index 4a4cf5cb..56e5e2b0 100755 --- a/src/ge/CMakeLists.txt +++ b/src/ge/CMakeLists.txt @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================ -# libge.so & libge_train.so +# libge_compiler.so & libge_train.so # will later be integrated into libgraph_runner.so, works for both training and inference # compiling proto files generates some warnings, use no-unused-variable to suppress them set(CMAKE_CXX_FLAGS "-Wno-unused-variable ${CMAKE_CXX_FLAGS}") @@ -49,7 +49,7 @@ include_directories(${CMAKE_BINARY_DIR}/proto/ge) ######### libge_train.so ############# # need to remove dependencies on pb files later -file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} +file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "common/formats/format_transfers/*.cc" "common/formats/formats.cc" "common/formats/utils/formats_trans_utils.cc" @@ -57,20 +57,24 @@ file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "common/ge/plugin_manager.cc" "common/profiling/profiling_manager.cc" "engine_manager/dnnengine_manager.cc" + "ge_local_engine/engine/host_cpu_engine.cc" "generator/ge_generator.cc" "generator/generator_api.cc" - "graph/build/graph_build.cc" + "graph/build/graph_builder.cc" + "graph/build/label_allocator.cc" "graph/build/logical_stream_allocator.cc" "graph/build/model_builder.cc" - "graph/build/optimize_stream_graph.cc" "graph/build/run_context.cc" "graph/build/stream_allocator.cc" + "graph/build/stream_graph_optimizer.cc" "graph/build/task_generator.cc" "graph/common/bcast.cc" "graph/common/omg_util.cc" "graph/common/transop_util.cc" "graph/execute/graph_execute.cc" + "graph/label/*.cc" "graph/load/graph_loader.cc" + "graph/load/new_model_manager/cpu_queue_schedule.cc" "graph/load/new_model_manager/data_dumper.cc" "graph/load/new_model_manager/data_inputer.cc" "graph/load/new_model_manager/davinci_model.cc" @@ -92,10 +96,12 @@ file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "graph/load/new_model_manager/task_info/stream_active_task_info.cc" "graph/load/new_model_manager/task_info/stream_switch_task_info.cc" + "graph/load/new_model_manager/task_info/stream_switchn_task_info.cc" + "graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc" + "graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc" "graph/load/new_model_manager/task_info/task_info.cc" "graph/load/new_model_manager/tbe_handle_store.cc" "graph/load/output/output.cc" - "graph/manager/custom/custom_op.cc" "graph/manager/graph_context.cc" "graph/manager/graph_manager.cc" "graph/manager/graph_manager_utils.cc" @@ -105,12 +111,9 @@ file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/manager/trans_var_data_utils.cc" 
"graph/manager/util/debug.cc" "graph/manager/util/hcom_util.cc" - "graph/manager/util/node_searcher/need_rebuild_node_searcher.cc" "graph/manager/util/rt_context_util.cc" "graph/manager/util/variable_accelerate_ctrl.cc" - "graph/optimize/graph_functiondef.cc" "graph/optimize/graph_optimize.cc" - "graph/optimize/graph_optimizer.cc" "graph/optimize/optimizer/allreduce_fusion_pass.cc" "graph/optimize/summary_optimize.cc" "graph/partition/engine_place.cc" @@ -120,7 +123,9 @@ file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/assert_pass.cc" "graph/passes/atomic_addr_clean_pass.cc" "graph/passes/base_pass.cc" + "graph/passes/cast_remove_pass.cc" "graph/passes/cast_translate_pass.cc" + "graph/passes/common_subexpression_elimination_pass.cc" "graph/passes/compile_nodes_pass.cc" "graph/passes/constant_folding_pass.cc" "graph/passes/constant_fuse_same_pass.cc" @@ -159,12 +164,14 @@ file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/folding_kernel/shape_kernel.cc" "graph/passes/folding_kernel/shape_n_kernel.cc" "graph/passes/folding_kernel/size_kernel.cc" + "graph/passes/folding_kernel/slice_d_kernel.cc" "graph/passes/folding_kernel/slice_kernel.cc" "graph/passes/folding_kernel/squeeze_kernel.cc" "graph/passes/folding_kernel/ssd_prior_box_kernel.cc" "graph/passes/folding_kernel/strided_slice_kernel.cc" "graph/passes/folding_kernel/sub_kernel.cc" "graph/passes/folding_kernel/transdata_kernel.cc" + "graph/passes/folding_kernel/unpack_kernel.cc" "graph/passes/folding_pass.cc" "graph/passes/get_original_format_pass.cc" "graph/passes/guarantee_const_pass.cc" @@ -179,7 +186,6 @@ file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/multi_batch_pass.cc" "graph/passes/net_output_pass.cc" "graph/passes/next_iteration_pass.cc" - "graph/passes/no_reshape_op_remove_pass.cc" "graph/passes/no_use_reshape_remove_pass.cc" "graph/passes/pass_manager.cc" "graph/passes/pass_utils.cc" @@ -188,6 +194,7 @@ file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/prevent_gradient_pass.cc" "graph/passes/print_op_pass.cc" "graph/passes/prune_pass.cc" + "graph/passes/replace_with_empty_const_pass.cc" "graph/passes/reshape_remove_pass.cc" "graph/passes/resource_pair_add_control_pass.cc" "graph/passes/resource_pair_remove_control_pass.cc" @@ -206,14 +213,12 @@ file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/transpose_transdata_pass.cc" "graph/passes/unused_const_pass.cc" "graph/passes/unused_op_remove_pass.cc" - "graph/passes/update_net_output_pass.cc" "graph/passes/var_is_initialized_op_pass.cc" "graph/passes/variable_format_pass.cc" "graph/passes/variable_op_pass.cc" "graph/passes/variable_prepare_op_pass.cc" "graph/passes/variable_ref_delete_op_pass.cc" "graph/preprocess/graph_preprocess.cc" - "graph/preprocess/insert_op/base_insert_op.cc" "graph/preprocess/insert_op/ge_aipp_op.cc" "graph/preprocess/insert_op/util_insert_aipp_op.cc" "graph/preprocess/multi_batch_copy_graph.cc" @@ -223,13 +228,8 @@ file(GLOB_RECURSE TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "opskernel_manager/ops_kernel_manager.cc" "session/inner_session.cc" "session/session_manager.cc" - "single_op/single_op.cc" - "single_op/single_op_manager.cc" - "single_op/single_op_model.cc" - "single_op/stream_resource.cc" - "single_op/task/build_task_utils.cc" - "single_op/task/op_task.cc" - "single_op/task/tbe_task_builder.cc" + "single_op/*.cc" + "single_op/task/*.cc" ) @@ -261,9 +261,9 @@ 
target_link_libraries(ge_train rt dl) -######### libge.so ############# +######### libge_compiler.so ############# # need to remove dependencies on pb files later -file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} +file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "common/formats/format_transfers/*.cc" "common/formats/formats.cc" "common/formats/utils/formats_trans_utils.cc" @@ -271,20 +271,24 @@ file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "common/ge/plugin_manager.cc" "common/profiling/profiling_manager.cc" "engine_manager/dnnengine_manager.cc" + "ge_local_engine/engine/host_cpu_engine.cc" "generator/ge_generator.cc" "generator/generator_api.cc" - "graph/build/graph_build.cc" + "graph/build/graph_builder.cc" + "graph/build/label_allocator.cc" "graph/build/logical_stream_allocator.cc" "graph/build/model_builder.cc" - "graph/build/optimize_stream_graph.cc" "graph/build/run_context.cc" "graph/build/stream_allocator.cc" + "graph/build/stream_graph_optimizer.cc" "graph/build/task_generator.cc" "graph/common/bcast.cc" "graph/common/omg_util.cc" "graph/common/transop_util.cc" "graph/execute/graph_execute.cc" + "graph/label/*.cc" "graph/load/graph_loader.cc" + "graph/load/new_model_manager/cpu_queue_schedule.cc" "graph/load/new_model_manager/data_dumper.cc" "graph/load/new_model_manager/data_inputer.cc" "graph/load/new_model_manager/davinci_model.cc" @@ -305,10 +309,12 @@ file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "graph/load/new_model_manager/task_info/stream_active_task_info.cc" "graph/load/new_model_manager/task_info/stream_switch_task_info.cc" + "graph/load/new_model_manager/task_info/stream_switchn_task_info.cc" + "graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc" + "graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc" "graph/load/new_model_manager/task_info/task_info.cc" "graph/load/new_model_manager/tbe_handle_store.cc" "graph/load/output/output.cc" - "graph/manager/custom/custom_op.cc" "graph/manager/graph_context.cc" "graph/manager/graph_manager.cc" "graph/manager/graph_manager_utils.cc" @@ -317,13 +323,9 @@ file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/manager/model_manager/event_manager.cc" "graph/manager/trans_var_data_utils.cc" "graph/manager/util/debug.cc" - "graph/manager/util/node_searcher/need_rebuild_node_searcher.cc" "graph/manager/util/rt_context_util.cc" "graph/manager/util/variable_accelerate_ctrl.cc" - "graph/optimize/graph_functiondef.cc" "graph/optimize/graph_optimize.cc" - "graph/optimize/graph_optimizer.cc" - "graph/optimize/optimizer/allreduce_fusion_inference_pass.cc" "graph/optimize/summary_optimize.cc" "graph/partition/engine_place.cc" "graph/partition/graph_partition.cc" @@ -332,7 +334,9 @@ file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/assert_pass.cc" "graph/passes/atomic_addr_clean_pass.cc" "graph/passes/base_pass.cc" + "graph/passes/cast_remove_pass.cc" "graph/passes/cast_translate_pass.cc" + "graph/passes/common_subexpression_elimination_pass.cc" "graph/passes/compile_nodes_pass.cc" "graph/passes/constant_folding_pass.cc" "graph/passes/constant_fuse_same_pass.cc" @@ -371,12 +375,14 @@ file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/folding_kernel/shape_kernel.cc" "graph/passes/folding_kernel/shape_n_kernel.cc" "graph/passes/folding_kernel/size_kernel.cc" + 
"graph/passes/folding_kernel/slice_d_kernel.cc" "graph/passes/folding_kernel/slice_kernel.cc" "graph/passes/folding_kernel/squeeze_kernel.cc" "graph/passes/folding_kernel/ssd_prior_box_kernel.cc" "graph/passes/folding_kernel/strided_slice_kernel.cc" "graph/passes/folding_kernel/sub_kernel.cc" "graph/passes/folding_kernel/transdata_kernel.cc" + "graph/passes/folding_kernel/unpack_kernel.cc" "graph/passes/folding_pass.cc" "graph/passes/get_original_format_pass.cc" "graph/passes/guarantee_const_pass.cc" @@ -391,7 +397,6 @@ file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/multi_batch_pass.cc" "graph/passes/net_output_pass.cc" "graph/passes/next_iteration_pass.cc" - "graph/passes/no_reshape_op_remove_pass.cc" "graph/passes/no_use_reshape_remove_pass.cc" "graph/passes/pass_manager.cc" "graph/passes/pass_utils.cc" @@ -400,6 +405,7 @@ file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/prevent_gradient_pass.cc" "graph/passes/print_op_pass.cc" "graph/passes/prune_pass.cc" + "graph/passes/replace_with_empty_const_pass.cc" "graph/passes/reshape_remove_pass.cc" "graph/passes/resource_pair_add_control_pass.cc" "graph/passes/resource_pair_remove_control_pass.cc" @@ -418,14 +424,12 @@ file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/passes/transpose_transdata_pass.cc" "graph/passes/unused_const_pass.cc" "graph/passes/unused_op_remove_pass.cc" - "graph/passes/update_net_output_pass.cc" "graph/passes/var_is_initialized_op_pass.cc" "graph/passes/variable_format_pass.cc" "graph/passes/variable_op_pass.cc" "graph/passes/variable_prepare_op_pass.cc" "graph/passes/variable_ref_delete_op_pass.cc" "graph/preprocess/graph_preprocess.cc" - "graph/preprocess/insert_op/base_insert_op.cc" "graph/preprocess/insert_op/ge_aipp_op.cc" "graph/preprocess/insert_op/util_insert_aipp_op.cc" "graph/preprocess/multi_batch_copy_graph.cc" @@ -442,16 +446,19 @@ file(GLOB_RECURSE INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "single_op/task/build_task_utils.cc" "single_op/task/op_task.cc" "single_op/task/tbe_task_builder.cc" +########################################## +# "ir_build/ge_ir_build.cc" +# "offline/atc_ir_common.cc" ) -add_library(ge SHARED ${INFER_SRC_LIST} ${PROTO_SRCS} ${PROTO_HEADER_HDRS}) -target_compile_definitions(ge PRIVATE +add_library(ge_compiler SHARED ${INFER_SRC_LIST} ${PROTO_SRCS} ${PROTO_HEADER_HDRS}) +target_compile_definitions(ge_compiler PRIVATE PROTOBUF_INLINE_NOT_IN_HEADERS=0 DAVINCI_SUPPORT_PROFILING REUSE_MEMORY=1 FMK_HOST_INFER PLATFORM_CLOUD) -target_link_libraries(ge +target_link_libraries(ge_compiler graph ge_common "-Wl,--whole-archive" diff --git a/src/ge/client/CMakeLists.txt b/src/ge/client/CMakeLists.txt index 353d62fe..c1111d8e 100755 --- a/src/ge/client/CMakeLists.txt +++ b/src/ge/client/CMakeLists.txt @@ -80,7 +80,7 @@ target_compile_definitions(ge_client_train PRIVATE PLATFORM_CLOUD) target_link_libraries(ge_client graph - ge + ge_compiler ge_common ${PROTOBUF_LIBRARY} ${register} diff --git a/src/ge/client/ge_api.cc b/src/ge/client/ge_api.cc index 44dd9239..679b155b 100644 --- a/src/ge/client/ge_api.cc +++ b/src/ge/client/ge_api.cc @@ -61,14 +61,14 @@ Status CheckDumpAndReuseMemory(const std::map &options) { const int kDecimal = 10; auto dump_op_env = std::getenv("DUMP_OP"); int dump_op_flag = (dump_op_env != nullptr) ? 
std::strtol(dump_op_env, nullptr, kDecimal) : 0; - auto disable_reuse_memory_iter = options.find("ge.exec.disableReuseMemory"); - if (disable_reuse_memory_iter != options.end()) { - if (disable_reuse_memory_iter->second == "0") { + auto disableReuseMemoryIter = options.find("ge.exec.disableReuseMemory"); + if (disableReuseMemoryIter != options.end()) { + if (disableReuseMemoryIter->second == "0") { GELOGD("ge.exec.disableReuseMemory=0, reuse memory is open"); if (dump_op_flag) { GELOGW("Will dump incorrect op data with GE Option ge.exec.disableReuseMemory=0"); } - } else if (disable_reuse_memory_iter->second == "1") { + } else if (disableReuseMemoryIter->second == "1") { GELOGD("ge.exec.disableReuseMemory=1, reuse memory is close"); } else { GELOGE(PARAM_INVALID, "CheckDumpAndReuseMemory ge.exec.disableReuseMemory is valid"); @@ -128,22 +128,29 @@ Status GEInitialize(const std::map &options) { OpsProtoManager *manager = OpsProtoManager::Instance(); std::map option_tmp; option_tmp.emplace(std::pair(string("ge.opsProtoLibPath"), opsproto_path)); + GE_TIMESTAMP_START(GEInitialize); bool is_proto_init = manager->Initialize(option_tmp); + GE_TIMESTAMP_END(GEInitialize, "GEInitialize::ManagerInitialize"); if (!is_proto_init) { GELOGE(GE_CLI_INIT_FAILED, "geInitialize failed, ops proto path is invalid."); return FAILED; } // check options is valid + GE_TIMESTAMP_START(CheckOptionsValid); if (CheckOptionsValid(options) != SUCCESS) { return FAILED; } + GE_TIMESTAMP_END(CheckOptionsValid, "GEInitialize::CheckOptionsValid"); + GE_TIMESTAMP_START(InitPreparation); SaveDdkVersion(options); - + GE_TIMESTAMP_END(InitPreparation, "GEInitialize::InitPreparation"); // call Initialize GELOGT(TRACE_RUNNING, "Initializing environment"); + GE_TIMESTAMP_START(GELibInitialize); Status ret = ge::GELib::Initialize(options); + GE_TIMESTAMP_END(GELibInitialize, "GEInitialize::GELibInitialize"); if (ret != SUCCESS) { GELOGE(GE_CLI_INIT_FAILED, "geInitialize failed, error code = %u", ret); return FAILED; @@ -170,17 +177,20 @@ Status GEFinalize() { std::lock_guard lock(kGeReleaseMutex); // call Finalize + Status ret = SUCCESS; + Status middle_ret; GELOGT(TRACE_RUNNING, "Finalizing environment"); - std::shared_ptr instance_ptr = ge::GELib::GetInstance(); - if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GEFinalize Failed: GE not initialized"); - return GE_CLI_GE_NOT_INITIALIZED; - } - Status ret = instance_ptr->Finalize(); - GELOGI("GEFinalize finalize gelib ret=%u", ret); - if (ret != SUCCESS) { - GELOGE(ret, "GEFinalize Failed"); - return FAILED; + std::shared_ptr instancePtr = ge::GELib::GetInstance(); + if (instancePtr == nullptr || !instancePtr->InitFlag()) { + GELOGW("GEFinalize Failed: GE not initialized."); + ret = GE_CLI_GE_NOT_INITIALIZED; + } + if (ret != GE_CLI_GE_NOT_INITIALIZED) { + middle_ret = instancePtr->Finalize(); + GELOGI("GEFinalize finalize gelib ret=%u", middle_ret); + if (middle_ret != SUCCESS) { + ret = middle_ret; + } } if (kGeInitialized && ret == SUCCESS) { @@ -379,8 +389,6 @@ Status Session::RunGraph(uint32_t graph_id, const std::vector &inputs, s } Status Session::RegisterCallBackFunc(const std::string &key, const pCallBackFunc &callback) { - GELOGW( - "The callback function will not be checked. 
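The GE_TIMESTAMP_START/END pairs added to GEInitialize above bracket each initialization phase so its cost shows up in the logs. Their real definitions live in framework headers outside this patch; a minimal stand-in with the same call shape might look like this (illustrative only, not the shipped macros):

#include <chrono>
#include <cstdio>

#define GE_TIMESTAMP_START(stage) \
  auto stage##_start = std::chrono::steady_clock::now()

#define GE_TIMESTAMP_END(stage, msg)                                  \
  do {                                                                \
    auto stage##_end = std::chrono::steady_clock::now();              \
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(  \
                  stage##_end - stage##_start)                        \
                  .count();                                           \
    std::printf("[GE_PERF] %s costs %lld us\n", msg, (long long)us);  \
  } while (0)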
Please ensure that the implementation of the function is trusted."); return ge::GELib::GetInstance()->SessionManagerObj().RegisterCallBackFunc(sessionId_, key, callback); } diff --git a/src/ge/common/CMakeLists.txt b/src/ge/common/CMakeLists.txt index 56a40b78..f458d87e 100755 --- a/src/ge/common/CMakeLists.txt +++ b/src/ge/common/CMakeLists.txt @@ -30,6 +30,8 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "formats/format_transfers/datatype_transfer.cc" "formats/format_transfers/format_transfer.cc" "formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc" + "formats/format_transfers/format_transfer_dhwcn_fracz3D.cc" + "formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc" "formats/format_transfers/format_transfer_fractal_nz.cc" "formats/format_transfers/format_transfer_fractal_z.cc" "formats/format_transfers/format_transfer_fractal_zz.cc" @@ -39,6 +41,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc" "formats/format_transfers/format_transfer_nc1hwc0_nchw.cc" "formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc" + "formats/format_transfers/format_transfer_nchw_fz_c04.cc" "formats/format_transfers/format_transfer_nchw_nc1hwc0.cc" "formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc" "formats/format_transfers/format_transfer_transpose.cc" @@ -50,6 +53,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "ge_format_util.cc" "helper/model_helper.cc" "helper/om_file_helper.cc" + "math/fp16_math.cc" "model_parser/base.cc" "op/attr_value_util.cc" "op/ge_op_utils.cc" @@ -59,9 +63,8 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "types.cc" "util.cc" "model_saver.cc" -# new files, possibly to be deleted? - "op/attr_value_util.cc" - "op/ge_op_utils.cc" + ############################### + "op/attr_define.cc" ) ge_protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) diff --git a/src/ge/common/auth/file_saver.cc b/src/ge/common/auth/file_saver.cc index fde9ce87..daa19448 100644 --- a/src/ge/common/auth/file_saver.cc +++ b/src/ge/common/auth/file_saver.cc @@ -23,15 +23,15 @@ #include #include +#include "common/math/math_util.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" #include "framework/common/util.h" +using ge::ModelBufferData; + namespace { const int kFileOpSuccess = 0; -const char TEE_PASSCODE_FILE_SUFFIX[] = ".PASSCODE"; -const char TEE_DAVINCI_FILE_SUFFIX[] = ".om"; -const size_t TEE_DAVINCI_FILE_SUFFIX_SIZE = 3; } // namespace namespace ge { @@ -42,15 +42,15 @@ Status FileSaver::OpenFile(int32_t &fd, const std::string &file_path) { } char real_path[PATH_MAX] = {0}; - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(file_path.length() >= PATH_MAX, return FAILED, "File path is too long!"); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(file_path.length() >= PATH_MAX, return FAILED, "File path is longer than PATH_MAX!"); GE_IF_BOOL_EXEC(realpath(file_path.c_str(), real_path) == nullptr, - GELOGI("File %s does not exit, it will be created.", file_path.c_str())); + GELOGI("File %s is not exit, it will be created.", file_path.c_str())); // Open file mode_t mode = S_IRUSR | S_IWUSR; fd = mmOpen2(real_path, O_RDWR | O_CREAT | O_TRUNC, mode); if (fd == EN_INVALID_PARAM || fd == EN_ERROR) { // -1: Failed to open file; - 2: Illegal parameter - GELOGE(FAILED, "Open file failed. mmpa_errno = %d", fd); + GELOGE(FAILED, "Open file failed. 
mmpa_errno = %d, %s", fd, strerror(errno)); return FAILED; } return SUCCESS; @@ -63,7 +63,7 @@ Status FileSaver::WriteData(const void *data, uint32_t size, int32_t fd) { int32_t write_count = mmWrite(fd, const_cast(data), size); // -1: Failed to write to file; - 2: Illegal parameter if (write_count == EN_INVALID_PARAM || write_count == EN_ERROR) { - GELOGE(FAILED, "Write data failed. mmpa_errorno = %d", write_count); + GELOGE(FAILED, "Write data failed. mmpa_errorno = %d, %s", write_count, strerror(errno)); return FAILED; } @@ -100,10 +100,11 @@ Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFi Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFileHeader &file_header, ModelPartitionTable &model_partition_table, + const std::vector &partition_datas) { GE_CHK_BOOL_RET_STATUS( !partition_datas.empty() && model_partition_table.num != 0 && model_partition_table.num == partition_datas.size(), - FAILED, "Invalid param:partition data size(%u), model_partition_table.num(%zu).", model_partition_table.num, + FAILED, "Invalid param:partition data size is (%u), model_partition_table.num is (%zu).", model_partition_table.num, partition_datas.size()); // Open file int32_t fd = 0; @@ -118,9 +119,9 @@ Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFi GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( WriteData(static_cast(&model_partition_table), table_size, fd) != SUCCESS, ret = FAILED; break); // Write partition data - for (const auto &partition_data : partition_datas) { + for (const auto &partitionData : partition_datas) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( - WriteData(static_cast(partition_data.data), partition_data.size, fd) != SUCCESS, ret = FAILED; + WriteData(static_cast(partitionData.data), partitionData.size, fd) != SUCCESS, ret = FAILED; break); } } while (0); @@ -129,6 +130,52 @@ Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFi return ret; } +Status FileSaver::SaveToBuffWithFileHeader(const ModelFileHeader &file_header, + ModelPartitionTable &model_partition_table, + const std::vector &partitionDatas, + ge::ModelBufferData &model) { + GE_CHK_BOOL_RET_STATUS( + !partitionDatas.empty() && model_partition_table.num != 0 && model_partition_table.num == partitionDatas.size(), + FAILED, "Invalid param:partition data size is (%u), model_partition_table.num is (%zu).", model_partition_table.num, + partitionDatas.size()); + uint32_t model_header_size = sizeof(ModelFileHeader); + uint32_t table_size = static_cast(SIZE_OF_MODEL_PARTITION_TABLE(model_partition_table)); + uint32_t total_size = model_header_size + table_size; + + for (const auto &partitionData : partitionDatas) { + auto ret = ge::CheckUint32AddOverflow(total_size, partitionData.size); + GE_CHK_BOOL_RET_STATUS(ret == SUCCESS, FAILED, "add uint32 overflow!"); + total_size = total_size + partitionData.size; + } + auto buff = reinterpret_cast(malloc(total_size)); + GE_CHK_BOOL_RET_STATUS(buff != nullptr, FAILED, "malloc failed!"); + GE_PRINT_DYNAMIC_MEMORY(malloc, "file buffer.", total_size) + model.data.reset(buff, [](uint8_t *buff) { + GELOGD("Free online model memory."); + free(buff); + buff = nullptr; + }); + model.length = total_size; + uint32_t left_space = total_size; + auto ret_mem1 = memcpy_s(buff, left_space, reinterpret_cast(const_cast(&file_header)), + model_header_size); + GE_CHK_BOOL_RET_STATUS(ret_mem1 == 0, FAILED, "memcpy_s failed!"); + buff += model_header_size; + left_space -= model_header_size; + auto ret_mem2 = memcpy_s(buff, 
left_space, reinterpret_cast(&model_partition_table), table_size); + GE_CHK_BOOL_RET_STATUS(ret_mem2 == 0, FAILED, "memcpy_s failed!"); + buff += table_size; + left_space -= table_size; + for (const auto &partitionData : partitionDatas) { + auto ret_mem3 = memcpy_s(buff, left_space, reinterpret_cast(const_cast(partitionData.data)), + partitionData.size); + GE_CHK_BOOL_RET_STATUS(ret_mem3 == 0, FAILED, "memcpy failed!"); + buff += partitionData.size; + left_space -= partitionData.size; + } + return SUCCESS; +} + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::CheckPath(const std::string &file_path) { // Determine file path length if (file_path.size() >= PATH_MAX) { @@ -171,7 +218,7 @@ FileSaver::SaveToFile(const string &file_path, const ge::ModelData &model, const int32_t copy_header_ret = 0; GE_IF_BOOL_EXEC(model_file_header != nullptr, copy_header_ret = memcpy_s(&file_header, sizeof(ModelFileHeader), model_file_header, sizeof(ModelFileHeader))); - GE_CHK_BOOL_RET_STATUS(copy_header_ret == 0, FAILED, "Copy ModelFileHeader failed! memcpy_s return: %d", + GE_CHK_BOOL_RET_STATUS(copy_header_ret == 0, FAILED, "Copy ModelFileHeader failed, memcpy_s return: %d", copy_header_ret); file_header.length = model.model_len; @@ -190,9 +237,34 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::SaveToFile(const string &file_path, ModelFileHeader &file_header, ModelPartitionTable &model_partition_table, const std::vector &partition_datas) { file_header.is_encrypt = ModelEncryptType::UNENCRYPTED; + const Status ret = SaveWithFileHeader(file_path, file_header, model_partition_table, partition_datas); - GE_CHK_BOOL_RET_STATUS(ret == SUCCESS, FAILED, "Save file failed, file_path:%s, file header len:%u.", + GE_CHK_BOOL_RET_STATUS(ret == SUCCESS, FAILED, "save file failed, file_path:%s, file header len:%u.", file_path.c_str(), file_header.length); return SUCCESS; } + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::SaveToFile(const string &file_path, const void *data, + int len) { + if (data == nullptr || len <= 0) { + GELOGE(FAILED, "Model_data is null or the length[%d] less than 1.", len); + return FAILED; + } + + // Open file + int32_t fd = 0; + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(OpenFile(fd, file_path) != SUCCESS, return FAILED, "OpenFile FAILED"); + + Status ret = SUCCESS; + + // write data + GE_CHK_BOOL_EXEC(SUCCESS == WriteData(data, (uint32_t)len, fd), ret = FAILED, "WriteData FAILED"); + + // Close file + if (mmClose(fd) != 0) { // mmClose 0: success + GELOGE(FAILED, "Close file failed."); + ret = FAILED; + } + return ret; +} } // namespace ge diff --git a/src/ge/common/auth/file_saver.h b/src/ge/common/auth/file_saver.h index b17c197d..d415746d 100644 --- a/src/ge/common/auth/file_saver.h +++ b/src/ge/common/auth/file_saver.h @@ -22,29 +22,30 @@ #include "framework/common/helper/om_file_helper.h" #include "framework/common/types.h" +#include "external/ge/ge_ir_build.h" #include "graph/buffer.h" #include "mmpa/mmpa_api.h" struct PROC_PARAM { uint8_t *model_name; - /* ISV Ek buffer */ + // ISV Ek buffer uint8_t *model_key; uint32_t model_key_len; - /* ISV root certificate buffer */ + // ISV root certificate buffer uint8_t *root_cert; uint32_t root_cert_len; - /* ISV private key buffer */ + // ISV private key buffer uint8_t *pri_key; uint32_t pri_key_len; - /* Raw AI Module Image buffer */ + // Raw AI Module Image buffer uint8_t *ai_image; uint32_t ai_image_len; - /* ISV HW key buffer */ + // ISV HW key buffer uint8_t *hw_key; uint32_t hw_key_len; }; @@ 
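SaveToBuffWithFileHeader above packs the header, the partition table, and every partition blob into one malloc-backed buffer, checking uint32 overflow before each size accumulation and tracking the remaining space across memcpy_s calls. A condensed sketch of the same pattern; the struct and function names are illustrative, not from the patch:

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <vector>

struct Chunk { const void *data; uint32_t size; };

std::shared_ptr<uint8_t> PackChunks(const std::vector<Chunk> &chunks, uint32_t &total) {
  total = 0;
  for (const auto &c : chunks) {
    if (c.size > UINT32_MAX - total) return nullptr;  // the overflow check
    total += c.size;
  }
  std::shared_ptr<uint8_t> buf(static_cast<uint8_t *>(std::malloc(total)), std::free);
  if (buf == nullptr) return nullptr;
  uint8_t *p = buf.get();
  for (const auto &c : chunks) {     // sequential copy, as in the real code
    std::memcpy(p, c.data, c.size);  // memcpy_s with left_space in the patch
    p += c.size;
  }
  return buf;
}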
-61,11 +62,11 @@ using std::string; class FileSaver { public: - /** - * @ingroup domi_common - * @brief save model, no encryption - * @return Status result - */ + /// + /// @ingroup domi_common + /// @brief save model, no encryption + /// @return Status result + /// static Status SaveToFile(const string &file_path, const ge::ModelData &model, const ModelFileHeader *model_file_header = nullptr); @@ -73,27 +74,32 @@ class FileSaver { ModelPartitionTable &model_partition_table, const std::vector &partition_datas); + static Status SaveToBuffWithFileHeader(const ModelFileHeader &file_header, ModelPartitionTable &model_partition_table, + const std::vector &partitionDatas, ge::ModelBufferData &model); + + static Status SaveToFile(const string &file_path, const void *data, int len); + protected: - /** - * @ingroup domi_common - * @brief Check validity of the file path - * @return Status result - */ + /// + /// @ingroup domi_common + /// @brief Check validity of the file path + /// @return Status result + /// static Status CheckPath(const string &file_path); static Status WriteData(const void *data, uint32_t size, int32_t fd); static Status OpenFile(int32_t &fd, const std::string &file_path); - /** - * @ingroup domi_common - * @brief save model to file - * @param [in] file_path file output path - * @param [in] file_header file header info - * @param [in] data model data - * @param [in] len model length - * @return Status result - */ + /// + /// @ingroup domi_common + /// @brief save model to file + /// @param [in] file_path file output path + /// @param [in] file_header file header info + /// @param [in] data model data + /// @param [in] len model length + /// @return Status result + /// static Status SaveWithFileHeader(const string &file_path, const ModelFileHeader &file_header, const void *data, int len); diff --git a/src/ge/common/convert/pb2json.cc b/src/ge/common/convert/pb2json.cc index 2c35a856..7c53968a 100644 --- a/src/ge/common/convert/pb2json.cc +++ b/src/ge/common/convert/pb2json.cc @@ -20,13 +20,16 @@ #include "common/convert/pb2json.h" #include #include - +#include "securec.h" #include "framework/common/fmk_types.h" using std::set; using std::string; namespace ge { +namespace { +const int kSignificantDigits = 10; +} // JSON parses non utf8 character throwing exceptions, so some fields need to be shielded through black fields FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void Pb2Json::Message2Json(const ProtobufMsg &message, const set &black_fields, Json &json, @@ -75,7 +78,7 @@ void Pb2Json::OneField2Json(const ProtobufMsg &message, const ProtobufFieldDescr case ProtobufFieldDescriptor::TYPE_MESSAGE: { const ProtobufMsg &tmp_message = reflection->GetMessage(message, field); if (0 != tmp_message.ByteSize()) { - Message2Json(tmp_message, black_fields, json[field->name()]); + Message2Json(tmp_message, black_fields, json[field->name()], enum2str); } break; } @@ -113,24 +116,47 @@ void Pb2Json::OneField2Json(const ProtobufMsg &message, const ProtobufFieldDescr break; case ProtobufFieldDescriptor::TYPE_FLOAT: - json[field->name()] = reflection->GetFloat(message, field); + char str[kSignificantDigits]; + sprintf_s(str, kSignificantDigits, "%g", reflection->GetFloat(message, field)); + json[field->name()] = str; break; case ProtobufFieldDescriptor::TYPE_STRING: - case ProtobufFieldDescriptor::TYPE_BYTES: json[field->name()] = reflection->GetString(message, field); break; + case ProtobufFieldDescriptor::TYPE_BYTES: { + string field_name = field->name(); + string type_bytes = 
reflection->GetString(message, field); + json[field_name] = TypeBytes2String(field_name, type_bytes); + break; + } + default: break; } } +string Pb2Json::TypeBytes2String(string &field_name, string &type_bytes) { + if (field_name != "offset") { + return type_bytes; + } + string result = ""; + for (char temp_value : type_bytes) { + uint8_t *value = 0; + value = reinterpret_cast(&temp_value); + char str[kSignificantDigits]; + sprintf_s(str, kSignificantDigits, "%d", *value); + result += str; + } + return result; +} + void Pb2Json::RepeatedMessage2Json(const ProtobufMsg &message, const ProtobufFieldDescriptor *field, const ProtobufReflection *reflection, const set &black_fields, Json &json, bool enum2str) { if (nullptr == field || nullptr == reflection) { - Message2Json(message, black_fields, json); + Message2Json(message, black_fields, json, enum2str); return; } @@ -140,7 +166,7 @@ void Pb2Json::RepeatedMessage2Json(const ProtobufMsg &message, const ProtobufFie case ProtobufFieldDescriptor::TYPE_MESSAGE: { const ProtobufMsg &tmp_message = reflection->GetRepeatedMessage(message, field, i); if (0 != tmp_message.ByteSize()) { - Message2Json(tmp_message, black_fields, tmp_json); + Message2Json(tmp_message, black_fields, tmp_json, enum2str); } } break; diff --git a/src/ge/common/convert/pb2json.h b/src/ge/common/convert/pb2json.h index 3f4fe84c..88ded50e 100644 --- a/src/ge/common/convert/pb2json.h +++ b/src/ge/common/convert/pb2json.h @@ -60,6 +60,8 @@ class Pb2Json { static void OneField2Json(const ProtobufMsg &message, const ProtobufFieldDescriptor *field, const ProtobufReflection *reflection, const std::set &black_fields, Json &json, bool enum2str); + + static std::string TypeBytes2String(std::string &field_name, std::string &type_bytes); }; } // namespace ge diff --git a/src/ge/common/debug/memory_dumper.cc b/src/ge/common/debug/memory_dumper.cc index 67a615f4..0534ff81 100644 --- a/src/ge/common/debug/memory_dumper.cc +++ b/src/ge/common/debug/memory_dumper.cc @@ -37,14 +37,18 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY MemoryDumper::~MemoryDumper() { // Dump the data to the file FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::DumpToFile(const char *filename, void *data, - uint32_t len) { - GE_CHK_BOOL_RET_STATUS(!(filename == nullptr || data == nullptr || len == 0), FAILED, - "Incorrect parameter. filename is nullptr || data is nullptr || len is 0"); - + int64_t len) { #ifdef FMK_SUPPORT_DUMP + GE_CHECK_NOTNULL(filename); + GE_CHECK_NOTNULL(data); + if (len == 0) { + GELOGE(FAILED, "len is 0."); + return PARAM_INVALID; + } + // Open the file int fd = OpenFile(filename); - if (kInvalidFd == fd) { + if (fd == kInvalidFd) { GELOGE(FAILED, "Open file failed."); return FAILED; } @@ -54,7 +58,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::DumpToFile int32_t mmpa_ret = mmWrite(fd, data, len); // mmWrite return -1:Failed to write data to file;return -2:Invalid parameter if (mmpa_ret == EN_ERROR || mmpa_ret == EN_INVALID_PARAM) { - GELOGE(FAILED, "Write to file failed. errno = %d", mmpa_ret); + GELOGE(FAILED, "Write to file failed. 
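TypeBytes2String above stringifies the protobuf "offset" bytes field by printing each byte as a decimal number into a fixed buffer and concatenating. A minimal equivalent without the sprintf_s round trip; note that the output carries no separator, so the byte sequences {1, 10} and {11, 0} both render as "110":

#include <cstdint>
#include <string>

// Behaviourally equivalent sketch of TypeBytes2String for the "offset" field.
std::string BytesToDecimalString(const std::string &bytes) {
  std::string result;
  for (char c : bytes) {
    result += std::to_string(static_cast<uint8_t>(c));
  }
  return result;
}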
errno = %d, %s", mmpa_ret, strerror(errno)); ret = FAILED; } @@ -65,7 +69,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::DumpToFile } return ret; - #else GELOGW("need to define FMK_SUPPORT_DUMP for dump op input and output."); return SUCCESS; @@ -102,12 +105,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::Dump(void int32_t mmpa_ret = mmWrite(fd_, data, len); // mmWrite return -1:failed to write data to file;return -2:invalid parameter if (mmpa_ret == EN_ERROR || mmpa_ret == EN_INVALID_PARAM) { - GELOGE(FAILED, "Write to file failed. errno = %d", mmpa_ret); + GELOGE(FAILED, "Write to file failed. errno = %d, %s", mmpa_ret, strerror(errno)); return FAILED; } return SUCCESS; - #else GELOGW("need to define FMK_SUPPORT_DUMP for dump op input and output."); return SUCCESS; @@ -155,7 +157,7 @@ int MemoryDumper::OpenFile(const char *filename) { int32_t fd = mmOpen2(real_path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, mode); if (fd == EN_ERROR || fd == EN_INVALID_PARAM) { - GELOGE(kInvalidFd, "Open file failed. errno = %d", fd); + GELOGE(kInvalidFd, "open file failed. errno = %d, %s", fd, strerror(errno)); return kInvalidFd; } return fd; diff --git a/src/ge/common/debug/memory_dumper.h b/src/ge/common/debug/memory_dumper.h index 3cb87c99..4995f5f7 100644 --- a/src/ge/common/debug/memory_dumper.h +++ b/src/ge/common/debug/memory_dumper.h @@ -45,7 +45,7 @@ class MemoryDumper { * @return FAILED output failed * @author */ - static Status DumpToFile(const char *filename, void *data, uint32_t len); + static Status DumpToFile(const char *filename, void *data, int64_t len); /** @ingroup domi_common * @brief open the dump file diff --git a/src/ge/common/formats/format_transfers/datatype_transfer.cc b/src/ge/common/formats/format_transfers/datatype_transfer.cc index 935880eb..bac3a178 100644 --- a/src/ge/common/formats/format_transfers/datatype_transfer.cc +++ b/src/ge/common/formats/format_transfers/datatype_transfer.cc @@ -27,6 +27,8 @@ #include "graph/utils/type_utils.h" #include "securec.h" +using ge::fp16_t; + namespace ge { namespace formats { diff --git a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc b/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc index 79194962..3458f83c 100644 --- a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc +++ b/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc @@ -27,7 +27,9 @@ namespace ge { namespace formats { namespace { -bool CheckDataTypeSupported(const DataType &data_type) { return (data_type == DT_FLOAT || data_type == DT_FLOAT16); } +bool CheckDataTypeSupported(const DataType &data_type) { + return (data_type == DT_FLOAT || data_type == DT_FLOAT16 || data_type == DT_INT8); +} Status CheckArgsForC1hwncoc0ToHwcn(const TransArgs &args) { auto src_shape = args.src_shape; @@ -51,10 +53,11 @@ Status CheckArgsForC1hwncoc0ToHwcn(const TransArgs &args) { GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); return PARAM_INVALID; } - if (src_shape.at(kC1hwncoc0C1) != (dst_shape.at(kHwcnC) - 1) / kCubeSize + 1 || + auto cube_size = GetCubeSizeByDataType(args.src_data_type); + if (src_shape.at(kC1hwncoc0C1) != (dst_shape.at(kHwcnC) - 1) / cube_size + 1 || src_shape.at(kC1hwncoc0H) != dst_shape.at(kHwcnH) || src_shape.at(kC1hwncoc0W) != dst_shape.at(kHwcnW) || - src_shape.at(kC1hwncoc0N) != dst_shape.at(kHwcnN) || src_shape.at(kC1hwncoc0Co) != kCubeSize || - src_shape.at(kC1hwncoc0C0) != 
kCubeSize) { + src_shape.at(kC1hwncoc0N) != dst_shape.at(kHwcnN) || src_shape.at(kC1hwncoc0Co) != cube_size || + src_shape.at(kC1hwncoc0C0) != cube_size) { GELOGE(PARAM_INVALID, "Failed to check relationship between src and dst shape, src shape %s, dst shape %s", ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str()); return PARAM_INVALID; @@ -78,6 +81,7 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, int size auto c0 = args.src_shape.at(kC1hwncoc0C0); auto co = args.src_shape.at(kC1hwncoc0Co); auto c = args.dst_shape.at(kHwcnC); + auto cube_size = GetCubeSizeByDataType(args.src_data_type); int64_t cn = c * n; int64_t wcn = w * cn; int64_t coc0 = co * c0; @@ -93,16 +97,16 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, int size int64_t c_head_addr = w_head_addr + c_idx * n; for (int64_t n_idx = 0; n_idx < n; n_idx++) { int64_t dst_idx = c_head_addr + n_idx; - int64_t c1_idx = c_idx / kCubeSize; - int64_t c0_idx = c_idx % kCubeSize; + int64_t c1_idx = c_idx / cube_size; + int64_t c0_idx = c_idx % cube_size; int64_t co_idx = c0_idx; int64_t src_idx = c1_idx * hwncoc0 + h_idx * wncoc0 + w_idx * ncoc0 + n_idx * coc0 + co_idx * c0 + c0_idx; auto src_offset = src_idx * size; auto dst_offset = dst_idx * size; // The memcpy_s/memset_s argument `dstMax` must be less than 2G auto protected_size = total_size - dst_offset < static_cast(SECUREC_MEM_MAX_LEN) - ? total_size - dst_offset - : static_cast(SECUREC_MEM_MAX_LEN); + ? total_size - dst_offset + : static_cast(SECUREC_MEM_MAX_LEN); auto ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), args.data + src_offset, static_cast(size)); if (ret != EOK) { diff --git a/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc b/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc new file mode 100644 index 00000000..45808fa0 --- /dev/null +++ b/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc @@ -0,0 +1,179 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common/formats/format_transfers/format_transfer_dhwcn_fracz3D.h" + +#include +#include + +#include "common/formats/utils/formats_definitions.h" +#include "common/formats/utils/formats_trans_utils.h" +#include "framework/common/debug/ge_log.h" +#include "graph/utils/type_utils.h" + +namespace ge { +namespace formats { +namespace { +Status CheckDataTypeSupport(DataType dtype) { return GetSizeByDataType(dtype) > 0 ? 
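The C1HWNCoC0 to HWCN transfer above now derives cube_size from the data type instead of hard-coding kCubeSize, so for fp16 (cube = 16) an HWCN filter [3, 3, 20, 7] pairs with C1HWNCoC0 [2, 3, 3, 7, 16, 16]: C1 = (20 - 1) / 16 + 1 = 2 and Co = C0 = 16. A sketch of the source-index mapping used inside GetDstDataAfterTrans (helper name illustrative):

#include <cstdint>

// Rebuilds the C1HWNCoC0 source index for HWCN coordinates (h, w, c, n),
// mirroring the src_idx expression above; cube = 16 assumes fp16 input.
int64_t SrcIndexC1hwncoc0(int64_t h, int64_t w, int64_t c, int64_t n,
                          int64_t H, int64_t W, int64_t N, int64_t cube) {
  int64_t c1 = c / cube, c0 = c % cube, co = c0;
  int64_t coc0 = cube * cube;  // Co * C0
  int64_t ncoc0 = N * coc0;
  int64_t wncoc0 = W * ncoc0;
  int64_t hwncoc0 = H * wncoc0;
  return c1 * hwncoc0 + h * wncoc0 + w * ncoc0 + n * coc0 + co * cube + c0;
}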
SUCCESS : UNSUPPORTED; } + +Status TransShapeToFz(int64_t d, int64_t n, int64_t c, int64_t h, int64_t w, DataType data_type, + std::vector &dst_shape) { + auto c0 = GetCubeSizeByDataType(data_type); + if (c0 < 0) { + return UNSUPPORTED; + } + + auto c1 = Ceil(c, c0); + auto no = Ceil(n, static_cast(kNiSize)); + + dst_shape.clear(); + dst_shape.push_back(d * c1 * h * w); + dst_shape.push_back(no); + dst_shape.push_back(kNiSize); + dst_shape.push_back(c0); + + return SUCCESS; +} + +Status TransShapeDhwckToFz3D(const std::vector &src_shape, DataType data_type, + std::vector &dst_shape) { + if (!CheckShapeValid(src_shape, kDhwcnDimsNum)) { + return PARAM_INVALID; + } + auto d = src_shape.at(kDhwcnD); + auto h = src_shape.at(kDhwcnH); + auto w = src_shape.at(kDhwcnW); + auto c = src_shape.at(kDhwcnC); + auto n = src_shape.at(kDhwcnN); + + return TransShapeToFz(d, n, c, h, w, data_type, dst_shape); +} +Status TransFormatDhwckToFz3D(const TransArgs &args, TransResult &result) { + if (!CheckShapeValid(args.src_shape, kDhwcnDimsNum)) { + return PARAM_INVALID; + } + int64_t d = args.src_shape[kDhwcnD]; + int64_t h = args.src_shape[kDhwcnH]; + int64_t w = args.src_shape[kDhwcnW]; + int64_t c = args.src_shape[kDhwcnC]; + int64_t n = args.src_shape[kDhwcnN]; + int64_t n1n0 = Ceil(n, static_cast(kNiSize)) * kNiSize; + int64_t c0 = GetCubeSizeByDataType(args.src_data_type); + int64_t c1 = Ceil(c, c0); + + auto cn = c * n; + auto wcn = w * cn; + auto hwcn = h * wcn; + auto n1n0c0 = n1n0 * c0; + auto wn1n0c0 = w * n1n0c0; + auto hwn1n0c0 = h * wn1n0c0; + auto c1hwn1n0c0 = c1 * hwn1n0c0; + + int64_t data_size = GetSizeByDataType(args.src_data_type); + int64_t dst_size = 1; + for (auto dim : args.dst_shape) { + dst_size *= dim; + } + dst_size *= data_size; + std::shared_ptr dst(new (std::nothrow) uint8_t[dst_size], std::default_delete()); + if (dst == nullptr) { + GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size); + return OUT_OF_MEMORY; + } + + for (int64_t di = 0; di < d; di++) { + for (int64_t c1i = 0; c1i < c1; c1i++) { + for (int64_t hi = 0; hi < h; hi++) { + for (int64_t wi = 0; wi < w; wi++) { + for (int64_t n1n0i = 0; n1n0i < n1n0; n1n0i++) { + for (int64_t c0i = 0; c0i < c0; c0i++) { + int64_t dst_idx = di * c1hwn1n0c0 + c1i * hwn1n0c0 + hi * wn1n0c0 + wi * n1n0c0 + n1n0i * c0 + c0i; + int64_t dst_offset = dst_idx * data_size; + auto pad_zero = ((c1i * c0 + c0i) >= c) || (n1n0i >= n); + auto protected_size = dst_size - dst_offset < static_cast(SECUREC_MEM_MAX_LEN) + ? 
dst_size - dst_offset + : static_cast(SECUREC_MEM_MAX_LEN); + errno_t ret; + if (pad_zero) { + ret = memset_s(dst.get() + dst_offset, static_cast(protected_size), 0, + static_cast(data_size)); + } else { + int64_t src_idx = di * hwcn + hi * wcn + wi * cn + (c1i * c0 + c0i) * n + n1n0i; + ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), + args.data + src_idx * data_size, static_cast(data_size)); + } + if (ret != EOK) { + GELOGE(INTERNAL_ERROR, "Failed to operate the dst memory at offset %ld, error-code %d, pad mode %d", + dst_offset, ret, pad_zero); + return INTERNAL_ERROR; + } + } + } + } + } + } + } + result.data = dst; + result.length = dst_size; + return SUCCESS; +} +} // namespace + +Status FormatTransferDhwcnFractalZ3D::TransFormat(const TransArgs &args, TransResult &result) { + GELOGD("Begin to trans format from %s to %s, src shape %s, data type %s, dst shape %s", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.dst_shape).c_str()); + std::vector expect_shape; + auto ret = TransShape(args.src_format, args.src_shape, args.src_data_type, args.dst_format, expect_shape); + if (ret != SUCCESS) { + return ret; + } + if (!args.dst_shape.empty() && args.dst_shape != expect_shape) { + GELOGE(PARAM_INVALID, "Failed to trans format from %s to %s, the dst shape %s is invalid, expect %s", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.dst_shape).c_str(), + ShapeToString(expect_shape).c_str()); + return PARAM_INVALID; + } + + if (args.src_format == FORMAT_DHWCN && args.dst_format == FORMAT_FRACTAL_Z_3D) { + return TransFormatDhwckToFz3D(args, result); + } + + return UNSUPPORTED; +} + +Status FormatTransferDhwcnFractalZ3D::TransShape(Format src_format, const std::vector &src_shape, + DataType data_type, Format dst_format, + std::vector &dst_shape) { + if (CheckDataTypeSupport(data_type) != SUCCESS) { + return UNSUPPORTED; + } + + if (src_format == FORMAT_DHWCN && dst_format == FORMAT_FRACTAL_Z_3D) { + return TransShapeDhwckToFz3D(src_shape, data_type, dst_shape); + } + + return UNSUPPORTED; +} + +REGISTER_FORMAT_TRANSFER(FormatTransferDhwcnFractalZ3D, FORMAT_DHWCN, FORMAT_FRACTAL_Z_3D) + +} // namespace formats +} // namespace ge diff --git a/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.h b/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.h new file mode 100644 index 00000000..55549cb3 --- /dev/null +++ b/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.h @@ -0,0 +1,34 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_COMMON_FORMATS_FORMAT_TRANSFERS_FORMAT_TRANSFER_DHWCN_FRACTAL_Z_3D_H_ +#define GE_COMMON_FORMATS_FORMAT_TRANSFERS_FORMAT_TRANSFER_DHWCN_FRACTAL_Z_3D_H_ + +#include +#include "common/formats/format_transfers/format_transfer.h" + +namespace ge { +namespace formats { +class FormatTransferDhwcnFractalZ3D : public FormatTransfer { + public: + Status TransFormat(const TransArgs &args, TransResult &result) override; + Status TransShape(Format src_format, const std::vector &src_shape, DataType data_type, Format dst_format, + std::vector &dst_shape) override; +}; +} // namespace formats +} // namespace ge + +#endif // GE_COMMON_FORMATS_FORMAT_TRANSFERS_FORMAT_TRANSFER_DHWCN_FRACTAL_Z_3D_H_ diff --git a/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc b/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc new file mode 100644 index 00000000..86c6935d --- /dev/null +++ b/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc @@ -0,0 +1,180 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.h" + +#include +#include + +#include "common/formats/utils/formats_definitions.h" +#include "common/formats/utils/formats_trans_utils.h" +#include "framework/common/debug/ge_log.h" +#include "graph/utils/type_utils.h" + +namespace ge { +namespace formats { +namespace { +Status CheckDataTypeSupport(DataType dtype) { return GetSizeByDataType(dtype) > 0 ? 
SUCCESS : UNSUPPORTED; }
+
+Status TransShapeToFz(int64_t d, int64_t n, int64_t c, int64_t h, int64_t w, DataType data_type,
+                      std::vector<int64_t> &dst_shape) {
+  auto c0 = GetCubeSizeByDataType(data_type);
+  if (c0 < 0) {
+    return UNSUPPORTED;
+  }
+
+  auto c1 = Ceil(c, c0);
+  auto no = Ceil(n, static_cast<int64_t>(kNiSize));
+
+  dst_shape.clear();
+  dst_shape.push_back(d * c1 * h * w);
+  dst_shape.push_back(no);
+  dst_shape.push_back(kNiSize);
+  dst_shape.push_back(c0);
+
+  return SUCCESS;
+}
+
+Status TransShapeDhwncToFz3DTranspose(const std::vector<int64_t> &src_shape, DataType data_type,
+                                      std::vector<int64_t> &dst_shape) {
+  if (!CheckShapeValid(src_shape, kDhwncDimsNum)) {
+    return PARAM_INVALID;
+  }
+  auto d = src_shape.at(kDhwncD);
+  auto h = src_shape.at(kDhwncH);
+  auto w = src_shape.at(kDhwncW);
+  auto n = src_shape.at(kDhwncN);
+  auto c = src_shape.at(kDhwncC);
+  // exchange n and c to reuse the DHWCN -> FRACTAL_Z_3D shape computation
+  return TransShapeToFz(d, c, n, h, w, data_type, dst_shape);
+}
+Status TransFormatDhwncToFz3DTranspose(const TransArgs &args, TransResult &result) {
+  if (!CheckShapeValid(args.src_shape, kDhwncDimsNum)) {
+    return PARAM_INVALID;
+  }
+  int64_t d = args.src_shape[kDhwncD];
+  int64_t h = args.src_shape[kDhwncH];
+  int64_t w = args.src_shape[kDhwncW];
+  // exchange n and c to reuse the DHWCN -> FRACTAL_Z_3D process
+  int64_t c = args.src_shape[kDhwncN];
+  int64_t n = args.src_shape[kDhwncC];
+  int64_t n1n0 = Ceil(n, static_cast<int64_t>(kNiSize)) * kNiSize;
+  int64_t c0 = GetCubeSizeByDataType(args.src_data_type);
+  int64_t c1 = Ceil(c, c0);
+
+  auto cn = c * n;
+  auto wcn = w * cn;
+  auto hwcn = h * wcn;
+  auto n1n0c0 = n1n0 * c0;
+  auto wn1n0c0 = w * n1n0c0;
+  auto hwn1n0c0 = h * wn1n0c0;
+  auto c1hwn1n0c0 = c1 * hwn1n0c0;
+
+  int64_t data_size = GetSizeByDataType(args.src_data_type);
+  int64_t dst_size = 1;
+  for (auto dim : args.dst_shape) {
+    dst_size *= dim;
+  }
+  dst_size *= data_size;
+  std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
+  if (dst == nullptr) {
+    GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
+           TypeUtils::FormatToSerialString(args.src_format).c_str(),
+           TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size);
+    return OUT_OF_MEMORY;
+  }
+
+  for (int64_t di = 0; di < d; di++) {
+    for (int64_t c1i = 0; c1i < c1; c1i++) {
+      for (int64_t hi = 0; hi < h; hi++) {
+        for (int64_t wi = 0; wi < w; wi++) {
+          for (int64_t n1n0i = 0; n1n0i < n1n0; n1n0i++) {
+            for (int64_t c0i = 0; c0i < c0; c0i++) {
+              int64_t dst_idx = di * c1hwn1n0c0 + c1i * hwn1n0c0 + hi * wn1n0c0 + wi * n1n0c0 + n1n0i * c0 + c0i;
+              int64_t dst_offset = dst_idx * data_size;
+              auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
+                                      ?
dst_size - dst_offset + : static_cast(SECUREC_MEM_MAX_LEN); + auto pad_zero = ((c1i * c0 + c0i) >= c) || (n1n0i >= n); + errno_t ret; + if (pad_zero) { + ret = memset_s(dst.get() + dst_offset, static_cast(protected_size), 0, + static_cast(data_size)); + } else { + int64_t src_idx = di * hwcn + hi * wcn + wi * cn + (c1i * c0 + c0i) * n + n1n0i; + ret = memcpy_s(dst.get() + dst_offset, static_cast(protected_size), + args.data + src_idx * data_size, static_cast(data_size)); + } + if (ret != EOK) { + GELOGE(INTERNAL_ERROR, "Failed to operate the dst memory at offset %ld, error-code %d, pad mode %d", + dst_offset, ret, pad_zero); + return INTERNAL_ERROR; + } + } + } + } + } + } + } + result.data = dst; + result.length = dst_size; + return SUCCESS; +} +} // namespace + +Status FormatTransferDhwncFractalZ3DTranspose::TransFormat(const TransArgs &args, TransResult &result) { + GELOGD("Begin to trans format from %s to %s, src shape %s, data type %s, dst shape %s", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.dst_shape).c_str()); + std::vector expect_shape; + auto ret = TransShape(args.src_format, args.src_shape, args.src_data_type, args.dst_format, expect_shape); + if (ret != SUCCESS) { + return ret; + } + if (!args.dst_shape.empty() && args.dst_shape != expect_shape) { + GELOGE(PARAM_INVALID, "Failed to trans format from %s to %s, the dst shape %s is invalid, expect %s", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.dst_shape).c_str(), + ShapeToString(expect_shape).c_str()); + return PARAM_INVALID; + } + + if (args.src_format == ge::FORMAT_DHWNC && args.dst_format == ge::FORMAT_FRACTAL_Z_3D_TRANSPOSE) { + return TransFormatDhwncToFz3DTranspose(args, result); + } + + return UNSUPPORTED; +} + +Status FormatTransferDhwncFractalZ3DTranspose::TransShape(Format src_format, const std::vector &src_shape, + DataType data_type, Format dst_format, + std::vector &dst_shape) { + if (CheckDataTypeSupport(data_type) != SUCCESS) { + return UNSUPPORTED; + } + + if (src_format == FORMAT_DHWNC && dst_format == FORMAT_FRACTAL_Z_3D_TRANSPOSE) { + return TransShapeDhwncToFz3DTranspose(src_shape, data_type, dst_shape); + } + + return UNSUPPORTED; +} + +REGISTER_FORMAT_TRANSFER(FormatTransferDhwncFractalZ3DTranspose, FORMAT_DHWNC, FORMAT_FRACTAL_Z_3D_TRANSPOSE) + +} // namespace formats +} // namespace ge diff --git a/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.h b/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.h new file mode 100644 index 00000000..82a9e14f --- /dev/null +++ b/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.h @@ -0,0 +1,34 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_FORMATS_FORMAT_TRANSFERS_FORMAT_TRANSFER_DHWNC_FRACTAL_Z_3D_TRANSPOSE_H_ +#define GE_COMMON_FORMATS_FORMAT_TRANSFERS_FORMAT_TRANSFER_DHWNC_FRACTAL_Z_3D_TRANSPOSE_H_ + +#include +#include "common/formats/format_transfers/format_transfer.h" + +namespace ge { +namespace formats { +class FormatTransferDhwncFractalZ3DTranspose : public FormatTransfer { + public: + Status TransFormat(const TransArgs &args, TransResult &result) override; + Status TransShape(Format src_format, const std::vector &src_shape, DataType data_type, Format dst_format, + std::vector &dst_shape) override; +}; +} // namespace formats +} // namespace ge + +#endif // GE_COMMON_FORMATS_FORMAT_TRANSFERS_FORMAT_TRANSFER_DHWNC_FRACTAL_Z_3D_TRANSPOSE_H_ diff --git a/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc b/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc index 2a223563..e7f6754f 100644 --- a/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc +++ b/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc @@ -27,16 +27,20 @@ namespace ge { namespace formats { namespace { -bool CheckDataTypeSupported(const DataType &data_type) { return (data_type == DT_FLOAT || data_type == DT_FLOAT16); } +bool CheckDataTypeSupported(const DataType &data_type) { + return (data_type == DT_FLOAT || data_type == DT_FLOAT16 || data_type == DT_INT8); +} -Status TransShapeHwcnToC1hwncoc0(const std::vector &src_shape, std::vector &dst_shape) { +Status TransShapeHwcnToC1hwncoc0(const DataType &data_type, const std::vector &src_shape, + std::vector &dst_shape) { + auto cube_size = GetCubeSizeByDataType(data_type); dst_shape.clear(); - dst_shape.push_back((src_shape.at(kHwcnC) - 1) / kCubeSize + 1); + dst_shape.push_back((src_shape.at(kHwcnC) - 1) / cube_size + 1); dst_shape.push_back(src_shape.at(kHwcnH)); dst_shape.push_back(src_shape.at(kHwcnW)); dst_shape.push_back(src_shape.at(kHwcnN)); - dst_shape.push_back(kCubeSize); - dst_shape.push_back(kCubeSize); + dst_shape.push_back(cube_size); + dst_shape.push_back(cube_size); if (!CheckShapeValid(dst_shape, kC1hwncoc0DimsNum)) { GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str()); return PARAM_INVALID; @@ -65,7 +69,7 @@ Status CheckArgsForHwcnToC1hwncoc0(const TransArgs &args) { return PARAM_INVALID; } std::vector expect_dst_shape; - auto ret = TransShapeHwcnToC1hwncoc0(args.src_shape, expect_dst_shape); + auto ret = TransShapeHwcnToC1hwncoc0(args.src_data_type, args.src_shape, expect_dst_shape); if (ret != SUCCESS) { return ret; } @@ -118,8 +122,8 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in int64_t dst_idx = c0_idx + co_head_addr; auto dst_offset = dst_idx * size; auto protected_size = total_size - dst_offset < static_cast(SECUREC_MEM_MAX_LEN) - ? total_size - dst_offset - : static_cast(SECUREC_MEM_MAX_LEN); + ? 
total_size - dst_offset
+                            : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
       int64_t c_idx = c0_idx + c1_idx * c0;
       int64_t src_idx = h_idx * wcn + w_idx * cn + c_idx * n + n_idx;
       auto src_offset = src_idx * size;
@@ -137,7 +141,7 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const in
       }
     } else {
       auto ret =
-          memset_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), 0, static_cast<size_t>(size));
+        memset_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), 0, static_cast<size_t>(size));
       if (ret != EOK) {
         GELOGE(INTERNAL_ERROR,
                "Failed to set to 0 to C1HWNCoC0[%ld, %ld, %ld, %ld, %ld, %ld] offset %ld, "
@@ -188,7 +192,7 @@ Status FormatTransferHwcnC1hwncoc0::TransShape(Format src_format, const std::vec
       GELOGE(PARAM_INVALID, "Failed to check src shape %s", ShapeToString(src_shape).c_str());
       return PARAM_INVALID;
     }
-    return TransShapeHwcnToC1hwncoc0(src_shape, dst_shape);
+    return TransShapeHwcnToC1hwncoc0(data_type, src_shape, dst_shape);
   } else {
     return UNSUPPORTED;
   }
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc b/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
new file mode 100644
index 00000000..481a64e9
--- /dev/null
+++ b/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
@@ -0,0 +1,306 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common/formats/format_transfers/format_transfer_nchw_fz_c04.h"
+#include "common/formats/format_transfers/format_transfer_transpose.h"
+
+#include <securec.h>
+#include <memory>
+#include <vector>
+
+#include "common/formats/utils/formats_definitions.h"
+#include "common/formats/utils/formats_trans_utils.h"
+#include "common/util.h"
+#include "framework/common/debug/ge_log.h"
+#include "graph/utils/type_utils.h"
+
+/** Explanation of the transfer from NCHW to FRACTAL_Z_C04:
+ * First step: pad along the N and C axes; here C must be less than or equal to 4.
+ *             After padding the shape is (ceil(n/16)*16, 4, h, w).
+ * Second step: transpose, giving (ceil(n/16)*16, h, w, 4).
+ * Third step: view the 4D tensor as 2D, where the first dim is N and the second dim is Z = h*w*c,
+ *             and pad it to (N, ceil(Z/16)*16).
+ * Last step: view the (N, ceil(Z/16)*16) buffer as 4D (N/16, 16, Z/16, 16) and transpose it to
+ *            (Z/16, N/16, 16, 16).
+ */
+
+namespace ge {
+namespace formats {
+namespace {
+
+constexpr int64_t kMaxDimsNumC = 4;
+
+Status CheckDataTypeSupport(DataType data_type) { return GetSizeByDataType(data_type) > 0 ? SUCCESS : UNSUPPORTED; }
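+
+// Shape walk-through of the steps above (illustrative only; names follow the functions below).
+// For (n, c, h, w) = (5, 3, 2, 2) with cube size 16:
+//   PaddingNC              -> (16, 4, 2, 2)
+//   transpose {0, 2, 3, 1} -> (16, 2, 2, 4), viewed as 2D (N, Z) = (N, h*w*c) = (16, 16)
+//   view as 4D             -> (N/16, 16, Z/16, 16) = (1, 16, 1, 16)
+//   transpose {2, 0, 1, 3} -> (Z/16, N/16, 16, 16) = (1, 1, 16, 16), the FRACTAL_Z_C04 result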
+
+Status TransShape(int64_t n, int64_t c, int64_t h, int64_t w, DataType data_type, std::vector<int64_t> &dst_shape) {
+  auto c0 = GetCubeSizeByDataType(data_type);
+  if (c0 < 0) {
+    return UNSUPPORTED;
+  }
+  auto chw = c * h * w;
+
+  auto first_dim = Ceil(chw, c0);
+  auto no = Ceil(n, static_cast<int64_t>(c0));
+
+  dst_shape.clear();
+  dst_shape.push_back(first_dim);
+  dst_shape.push_back(no);
+  dst_shape.push_back(c0);
+  dst_shape.push_back(c0);
+
+  if (!IsShapeValid(dst_shape)) {
+    GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str());
+    return PARAM_INVALID;
+  }
+  return SUCCESS;
+}
+
+Status TransShapeNchwToFzC04(const std::vector<int64_t> &src_shape, DataType data_type,
+                             std::vector<int64_t> &dst_shape) {
+  if (!CheckShapeValid(src_shape, kNchwDimsNum)) {
+    return PARAM_INVALID;
+  }
+
+  auto n = src_shape.at(kNchwN);
+  auto c = src_shape.at(kNchwC);
+  auto h = src_shape.at(kNchwH);
+  auto w = src_shape.at(kNchwW);
+  return TransShape(n, c, h, w, data_type, dst_shape);
+}
+
+Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) {
+  int64_t n = args.src_shape.at(kNchwN);
+  int64_t c = args.src_shape.at(kNchwC);
+  int64_t h = args.src_shape.at(kNchwH);
+  int64_t w = args.src_shape.at(kNchwW);
+
+  int64_t c0 = GetCubeSizeByDataType(args.src_data_type);
+  int size = GetSizeByDataType(args.src_data_type);
+
+  auto data = args.data;
+  TransResult trans_result_1;
+  std::vector<int64_t> perm_arg_1 = {0, 2, 3, 1};
+  std::vector<int64_t> expect_shape = {n, h, w, c};
+  auto ret = ge::formats::Transpose(data, args.src_shape, args.src_data_type, perm_arg_1, trans_result_1);
+  if (ret != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Failed to Transpose from NCHW to NHWC");
+    return NOT_CHANGED;
+  }
+
+  TransArgs args_tmp = args;
+  args_tmp.src_shape = expect_shape;
+  args_tmp.data = trans_result_1.data.get();
+  // check that the size still matches the original
+  size_t expect_size = n * c * h * w * size;  // multiplication overflow was checked earlier
+  if (trans_result_1.length != expect_size) {
+    GELOGE(INTERNAL_ERROR, "Size does not match after transpose!");
+    return NOT_CHANGED;
+  }
+
+  /* prepare for padding in chw */
+  int64_t tmp = h * w * c;
+  int64_t n_o = Ceil(n, static_cast<int64_t>(c0));
+  int64_t c_o = c0;
+  int64_t h_o = Ceil(tmp, c0);
+  int64_t w_o = c0;
+  std::vector<int64_t> shape_o = {n_o, c_o, h_o, w_o};
+
+  // overall data overflow check
+  GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(h_o, w_o),
+                  GELOGE(INTERNAL_ERROR, "int64 mul overflow. A[%lld], B[%lld]", h_o, w_o);
+                  return INTERNAL_ERROR);
+  GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(n_o, c_o),
+                  GELOGE(INTERNAL_ERROR, "int64 mul overflow. A[%lld], B[%lld]", n_o, c_o);
+                  return INTERNAL_ERROR);
+  auto t1 = h_o * w_o;
+  auto t2 = n_o * c_o;
+  GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(t1, t2), GELOGE(INTERNAL_ERROR, "int64 mul overflow. A[%lld], B[%lld]", t1, t2);
+                  return INTERNAL_ERROR);
+
+  int64_t total_ele_cnt = n_o * c_o * h_o * w_o;
+  GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(total_ele_cnt, size),
+                  GELOGE(INTERNAL_ERROR, "int64 mul overflow. A[%lld], B[%lld]", total_ele_cnt, size);
+                  return INTERNAL_ERROR);
+  int64_t dst_size = total_ele_cnt * size;
+
+  std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
+  if (dst == nullptr) {
+    GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
+           TypeUtils::FormatToSerialString(args.src_format).c_str(),
+           TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size);
+    return OUT_OF_MEMORY;
+  }
+  auto retMem = memset_s(dst.get(), dst_size, 0, dst_size);
+  if (retMem != EOK) {
+    GELOGE(INTERNAL_ERROR, "memset_s failed!");
+    return INTERNAL_ERROR;
+  }
+  // copy data
+  auto block = c * h * w * size;
+  auto stride = h_o * w_o * size;
+  auto p_s = trans_result_1.data.get();
+  auto p_d = dst.get();
+  auto protectSize = dst_size;
+  for (auto k = 0; k < n; k++) {
+    ret = memcpy_s(p_d + k * stride, protectSize, p_s + k * block, block);
+    if (ret != EOK) {
+      GELOGE(INTERNAL_ERROR, "memcpy_s failed!");
+      return INTERNAL_ERROR;
+    }
+    protectSize = protectSize - block;
+  }
+
+  // transpose with perm {2, 0, 1, 3}
+  std::vector<int64_t> perm_arg_2 = {2, 0, 1, 3};
+  ret = ge::formats::Transpose(dst.get(), shape_o, args.src_data_type, perm_arg_2, result);
+  if (ret != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Failed to Transpose with perm {2, 0, 1, 3}");
+    return NOT_CHANGED;
+  }
+
+  return SUCCESS;
+}
+
+Status PaddingNC(const TransArgs &args, TransArgs &args_tmp, std::shared_ptr<uint8_t> &dst) {
+  args_tmp = args;
+  auto src_shape = args_tmp.src_shape;
+  if (!CheckShapeValid(src_shape, kNchwDimsNum)) {
+    return PARAM_INVALID;
+  }
+  int64_t c0 = GetCubeSizeByDataType(args.src_data_type);
+
+  auto n = src_shape.at(kNchwN);
+  auto c = src_shape.at(kNchwC);
+  auto h = src_shape.at(kNchwH);
+  auto w = src_shape.at(kNchwW);
+
+  if (c > kMaxDimsNumC) {
+    GELOGE(PARAM_INVALID, "Invalid dim c num[%lu]. It should be in (0, 4]", c);
+    return PARAM_INVALID;
+  }
+
+  auto n_o = Ceil(n, c0) * c0;
+  auto c_o = kMaxDimsNumC;
+  auto h_o = h;
+  auto w_o = w;
+  args_tmp.src_shape.at(kNchwN) = n_o;
+  args_tmp.src_shape.at(kNchwC) = c_o;
+  args_tmp.src_shape.at(kNchwH) = h_o;
+  args_tmp.src_shape.at(kNchwW) = w_o;
+
+  // data overflow check
+  GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(h_o, w_o),
+                  GELOGE(INTERNAL_ERROR, "int64 mul overflow. A[%lld], B[%lld]", h_o, w_o);
+                  return INTERNAL_ERROR);
+  GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(n_o, c_o),
+                  GELOGE(INTERNAL_ERROR, "int64 mul overflow. A[%lld], B[%lld]", n_o, c_o);
+                  return INTERNAL_ERROR);
+  auto t1 = h_o * w_o;
+  auto t2 = n_o * c_o;
+  GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(t1, t2), GELOGE(INTERNAL_ERROR, "int64 mul overflow. A[%lld], B[%lld]", t1, t2);
+                  return INTERNAL_ERROR);
+
+  int64_t total_ele_cnt = n_o * c_o * h_o * w_o;
+  int size = GetSizeByDataType(args.src_data_type);
+  GE_IF_BOOL_EXEC(!CheckInt64MulOverflow(total_ele_cnt, size),
+                  GELOGE(INTERNAL_ERROR, "int64 mul overflow. A[%lld], B[%lld]", total_ele_cnt, size);
+                  return INTERNAL_ERROR);
+
+  int64_t dst_size = total_ele_cnt * size;
+  dst.reset(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
+  if (dst == nullptr) {
+    GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
+           TypeUtils::FormatToSerialString(args.src_format).c_str(),
+           TypeUtils::FormatToSerialString(args.dst_format).c_str(), dst_size);
+    return OUT_OF_MEMORY;
+  }
+  auto ret = memset_s(dst.get(), dst_size, 0, dst_size);
+  if (ret != EOK) {
+    GELOGE(INTERNAL_ERROR, "memset_s failed!");
+    return INTERNAL_ERROR;
+  }
+
+  auto p_s = args.data;
+  auto p_d = dst.get();
+  auto block = h * w * size;
+  auto protectSize = dst_size;
+
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < c; j++) {
+      ret = memcpy_s(p_d + (i * c_o * h_o * w_o + j * h_o * w_o) * size, protectSize,
+                     p_s + (i * c * h * w + j * h * w) * size, block);
+      if (ret != EOK) {
+        GELOGE(INTERNAL_ERROR, "memcpy_s failed!");
+        return INTERNAL_ERROR;
+      }
+      protectSize = protectSize - block;
+    }
+  }
+  args_tmp.data = dst.get();
+
+  return SUCCESS;
+}
+} // namespace
+
+Status
FormatTransferNchwToFZC04::TransFormat(const TransArgs &args, TransResult &result) { + GELOGD("Begin to trans format from %s to %s, src shape %s, data type %s, dst shape %s", + TypeUtils::FormatToSerialString(args.src_format).c_str(), + TypeUtils::FormatToSerialString(args.dst_format).c_str(), ShapeToString(args.src_shape).c_str(), + TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.dst_shape).c_str()); + TransArgs args_tmp = args; + std::shared_ptr dst = nullptr; + auto ret = PaddingNC(args, args_tmp, dst); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Padding in NC axis failed!"); + return ret; + } + + std::vector expect_shape; + ret = TransShape(args_tmp.src_format, args_tmp.src_shape, args_tmp.src_data_type, args_tmp.dst_format, expect_shape); + if (ret != SUCCESS) { + return ret; + } + + if (!args_tmp.dst_shape.empty() && args_tmp.dst_shape != expect_shape) { + GELOGE(PARAM_INVALID, "Failed to trans format from %s to %s, the dst shape %s is invalid, expect %s", + TypeUtils::FormatToSerialString(args_tmp.src_format).c_str(), + TypeUtils::FormatToSerialString(args_tmp.dst_format).c_str(), ShapeToString(args_tmp.dst_shape).c_str(), + ShapeToString(expect_shape).c_str()); + return PARAM_INVALID; + } + + if (args_tmp.src_format == FORMAT_NCHW && args_tmp.dst_format == FORMAT_FRACTAL_Z_C04) { + return TransFormatFromNchwToFzC04(args_tmp, result); + } + + return UNSUPPORTED; +} + +Status FormatTransferNchwToFZC04::TransShape(Format src_format, const std::vector &src_shape, + DataType data_type, Format dst_format, std::vector &dst_shape) { + if (CheckDataTypeSupport(data_type) != SUCCESS) { + return UNSUPPORTED; + } + if (src_format == FORMAT_NCHW && dst_format == FORMAT_FRACTAL_Z_C04) { + return TransShapeNchwToFzC04(src_shape, data_type, dst_shape); + } + + return UNSUPPORTED; +} + +REGISTER_FORMAT_TRANSFER(FormatTransferNchwToFZC04, FORMAT_NCHW, FORMAT_FRACTAL_Z_C04) + +} // namespace formats +} // namespace ge diff --git a/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.h b/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.h new file mode 100644 index 00000000..a1232d47 --- /dev/null +++ b/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.h @@ -0,0 +1,35 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_COMMON_FORMATS_FORMAT_TRANSFERS_NCHW_FZC04_H_ +#define GE_COMMON_FORMATS_FORMAT_TRANSFERS_NCHW_FZC04_H_ + +#include + +#include "common/formats/format_transfers/format_transfer.h" + +namespace ge { +namespace formats { +class FormatTransferNchwToFZC04 : public FormatTransfer { + public: + Status TransFormat(const ge::formats::TransArgs &args, ge::formats::TransResult &result) override; + Status TransShape(Format src_format, const std::vector &src_shape, DataType data_type, Format dst_format, + std::vector &dst_shape) override; +}; +} // namespace formats +} // namespace ge + +#endif // GE_COMMON_FORMATS_FORMAT_TRANSFERS_FORMAT_TRANSFER_FRACTAL_Z_H_ diff --git a/src/ge/common/formats/utils/formats_definitions.h b/src/ge/common/formats/utils/formats_definitions.h index 3bc394ce..d889c33c 100644 --- a/src/ge/common/formats/utils/formats_definitions.h +++ b/src/ge/common/formats/utils/formats_definitions.h @@ -24,38 +24,13 @@ static const int kCubeSize = 16; static const int kNiSize = 16; static const int64_t kShapeItemNumMAX = 1024UL * 1024UL * 1024UL * 1024UL; -enum NchwDimIndex { - kNchwN, - kNchwC, - kNchwH, - kNchwW, - kNchwDimsNum -}; +enum NchwDimIndex { kNchwN, kNchwC, kNchwH, kNchwW, kNchwDimsNum }; -enum NhwcDimIndex { - kNhwcN, - kNhwcH, - kNhwcW, - kNhwcC, - kNhwcDimsNum -}; +enum NhwcDimIndex { kNhwcN, kNhwcH, kNhwcW, kNhwcC, kNhwcDimsNum }; -enum HwcnDimIndex { - kHwcnH, - kHwcnW, - kHwcnC, - kHwcnN, - kHwcnDimsNum -}; +enum HwcnDimIndex { kHwcnH, kHwcnW, kHwcnC, kHwcnN, kHwcnDimsNum }; -enum Nc1hwc0DimIndex { - kNc1hwc0N, - kNc1hwc0C1, - kNc1hwc0H, - kNc1hwc0W, - kNc1hwc0C0, - kNc1hwc0DimsNum -}; +enum Nc1hwc0DimIndex { kNc1hwc0N, kNc1hwc0C1, kNc1hwc0H, kNc1hwc0W, kNc1hwc0C0, kNc1hwc0DimsNum }; enum C1hwncoc0DimIndex { kC1hwncoc0C1, @@ -67,13 +42,11 @@ enum C1hwncoc0DimIndex { kC1hwncoc0DimsNum }; -enum FracZDimIndex { - kFracZHWC1, - kFracZN0, - kFracZNi, - kFracZC0, - kFracZDimsNum -}; +enum FracZDimIndex { kFracZHWC1, kFracZN0, kFracZNi, kFracZC0, kFracZDimsNum }; + +enum DhwcnDimIndex { kDhwcnD, kDhwcnH, kDhwcnW, kDhwcnC, kDhwcnN, kDhwcnDimsNum }; + +enum DhwncDimIndex { kDhwncD, kDhwncH, kDhwncW, kDhwncN, kDhwncC, kDhwncDimsNum }; } // namespace formats } // namespace ge diff --git a/src/ge/common/fp16_t.cc b/src/ge/common/fp16_t.cc index 76dfe348..7b111e63 100644 --- a/src/ge/common/fp16_t.cc +++ b/src/ge/common/fp16_t.cc @@ -19,48 +19,33 @@ #include "external/register/register_types.h" namespace { -const int32_t kInt32SymbolShift = 31; -const int32_t kBitShift_32 = 32; -const int32_t kDim_2 = 2; -const int32_t kDim_11 = 11; -} // namespace - +constexpr uint16_t kManBitLength = 11; +} namespace ge { -union Fp16ToFloatData { - uint32_t uint_data; - float float_data; -}; - -/// /// @ingroup fp16_t global filed /// @brief round mode of last valid digital -/// -const fp16RoundMode_t g_round_mode = ROUND_TO_NEAREST; +enum TagFp16RoundMode g_round_mode = kRoundToNearest; -void ExtractFP16(const uint16_t &val, uint16_t *s, int16_t *e, uint16_t *m) { +void ExtractFp16(const uint16_t &val, uint16_t &s, int16_t &e, uint16_t &m) { // 1.Extract - *s = static_cast(FP16_EXTRAC_SIGN(val)); - *e = static_cast(FP16_EXTRAC_EXP(val)); - *m = static_cast(FP16_EXTRAC_MAN(val)); - + s = static_cast(FP16_EXTRAC_SIGN(val)); + e = static_cast(FP16_EXTRAC_EXP(val)); + m = static_cast(FP16_EXTRAC_MAN(val)); // Denormal - if ((*e) == 0) { - *e = 1; + if (e == 0) { + e = 1; } } - -/// /// @ingroup fp16_t static method /// @param [in] man truncated mantissa /// @param [in] shift_out 
left shift bits based on ten bits /// @brief judge whether to add one to the result while converting fp16_t to other datatype /// @return Return true if add one, otherwise false -/// static bool IsRoundOne(uint64_t man, uint16_t trunc_len) { uint64_t mask0 = 0x4; uint64_t mask1 = 0x2; uint64_t mask2; - uint16_t shift_out = static_cast(trunc_len - kDim_2); + uint16_t shift_out = static_cast(trunc_len - kDim2); mask0 = mask0 << shift_out; mask1 = mask1 << shift_out; mask2 = mask1 - 1; @@ -68,31 +53,768 @@ static bool IsRoundOne(uint64_t man, uint16_t trunc_len) { bool last_bit = ((man & mask0) > 0); bool trunc_high = false; bool trunc_left = false; - if (g_round_mode == ROUND_TO_NEAREST) { + if (g_round_mode == kRoundToNearest) { trunc_high = ((man & mask1) > 0); trunc_left = ((man & mask2) > 0); } return (trunc_high && (trunc_left || last_bit)); } - -/// /// @ingroup fp16_t public method /// @param [in] exp exponent of fp16_t value /// @param [in] man exponent of fp16_t value /// @brief normalize fp16_t value /// @return -/// static void Fp16Normalize(int16_t &exp, uint16_t &man) { - if (exp >= FP16_MAX_EXP) { - exp = FP16_MAX_EXP - 1; - man = FP16_MAX_MAN; - } else if (exp == 0 && man == FP16_MAN_HIDE_BIT) { + // set to invalid data + if (exp >= kFp16MaxExp) { + exp = static_cast(kFp16MaxExp); + man = static_cast(kFp16MaxMan); + } else if (exp == 0 && man == kFp16ManHideBit) { exp++; man = 0; } } -// Evaluation +/// @ingroup fp16_t math conversion static method +/// @param [in] fp_val uint16_t value of fp16_t object +/// @brief Convert fp16_t to float/fp32 +/// @return Return float/fp32 value of fp_val which is the value of fp16_t object +static float Fp16ToFloat(const uint16_t &fp_val) { + uint16_t hf_sign; + uint16_t hf_man; + int16_t hf_exp; + ExtractFp16(fp_val, hf_sign, hf_exp, hf_man); + + while (hf_man && !(hf_man & kFp16ManHideBit)) { + hf_man <<= 1; + hf_exp--; + } + + uint32_t e_ret, m_ret; + uint32_t s_ret = hf_sign; + if (hf_man == 0) { + e_ret = 0; + m_ret = 0; + } else { + e_ret = hf_exp - kFp16ExpBias + kFp32ExpBias; + m_ret = hf_man & kFp16ManMask; + m_ret = m_ret << (kFp32ManLen - kFp16ManLen); + } + uint32_t f_val = FP32_CONSTRUCTOR(s_ret, e_ret, m_ret); + auto p_ret_v = reinterpret_cast(&f_val); + + return *p_ret_v; +} +/// @ingroup fp16_t math conversion static method +/// @param [in] fp_val uint16_t value of fp16_t object +/// @brief Convert fp16_t to double/fp64 +/// @return Return double/fp64 value of fp_val which is the value of fp16_t object +static double Fp16ToDouble(const uint16_t &fp_val) { + uint16_t hf_sign; + uint16_t hf_man; + int16_t hf_exp; + ExtractFp16(fp_val, hf_sign, hf_exp, hf_man); + + while (hf_man && !(hf_man & kFp16ManHideBit)) { + hf_man <<= 1; + hf_exp--; + } + + uint64_t e_ret; + uint64_t m_ret; + uint64_t s_ret = hf_sign; + if (!hf_man) { + e_ret = 0; + m_ret = 0; + } else { + e_ret = hf_exp - kFp16ExpBias + kFp64ExpBias; + m_ret = hf_man & kFp16ManMask; + m_ret = m_ret << (kFp64ManLen - kFp16ManLen); + } + uint64_t f_val = (s_ret << kFp64SignIndex) | (e_ret << kFp64ManLen) | (m_ret); + auto p_ret_v = reinterpret_cast(&f_val); + + return *p_ret_v; +} +/// @ingroup fp16_t static method +/// @param [in] s_ret sign of fp16_t value +/// @param [in] long_int_m man uint64_t value of fp16_t object +/// @param [in] shift_out shift offset +/// @brief calculate uint8 value by sign,man and shift offset +/// @return Return uint8 value of fp16_t object +static uint8_t GetUint8ValByMan(uint8_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) { 
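+ // Rounding note (a sketch of the helper's contract, assuming round-to-nearest mode):
+ // IsRoundOne(man, n) reports whether dropping the low n bits of `man` should round the
+ // kept value up, i.e. round-to-nearest with ties-to-even: round up iff the highest
+ // dropped bit is 1 and either some lower dropped bit or the kept LSB is 1.
+ // For example, with n = 2: man = 0b110 (1.5 after the shift) rounds up to 2, while
+ // man = 0b010 (0.5) stays at 0.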
+ bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen); + auto m_ret = static_cast((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max); + need_round = need_round && ((s_ret == 0 && m_ret < kInt8Max) || (s_ret == 1 && m_ret <= kInt8Max)); + if (need_round) { + m_ret++; + } + if (s_ret) { + m_ret = (~m_ret) + 1; + } + if (m_ret == 0) { + s_ret = 0; + } + return static_cast((s_ret << kBitShift7) | (m_ret)); +} +/// @ingroup fp16_t math conversion static method +/// @param [in] fp_val uint16_t value of fp16_t object +/// @brief Convert fp16_t to int8_t +/// @return Return int8_t value of fp_val which is the value of fp16_t object +static int8_t Fp16ToInt8(const uint16_t &fp_val) { + int8_t ret; + uint8_t ret_v; + // 1.get s_ret and shift it to bit0. + uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val); + // 2.get hf_e and hf_m + uint16_t hf_e = FP16_EXTRAC_EXP(fp_val); + uint16_t hf_m = FP16_EXTRAC_MAN(fp_val); + + if (FP16_IS_DENORM(fp_val)) { // Denormalized number + ret_v = 0; + ret = *(reinterpret_cast(&ret_v)); + return ret; + } + + uint64_t long_int_m = hf_m; + uint8_t overflow_flag = 0; + uint16_t shift_out = 0; + if (FP16_IS_INVALID(fp_val)) { // Inf or NaN + overflow_flag = 1; + } else { + while (hf_e != kFp16ExpBias) { + if (hf_e > kFp16ExpBias) { + hf_e--; + long_int_m = long_int_m << 1; + if (s_ret == 1 && long_int_m >= 0x20000u) { // sign=1,negative number(<0) + long_int_m = 0x20000u; // 10 0000 0000 0000 0000 10(fp16_t-man)+7(int8)=17bit + overflow_flag = 1; + break; + } else if (s_ret != 1 && long_int_m >= 0x1FFFFu) { // sign=0,positive number(>0) + long_int_m = 0x1FFFFu; // 01 1111 1111 1111 1111 10(fp16_t-man)+7(int8) + overflow_flag = 1; + break; + } + } else { + hf_e++; + shift_out++; + } + } + } + if (overflow_flag) { + ret_v = kInt8Max + s_ret; + } else { + // Generate final result + ret_v = GetUint8ValByMan(s_ret, long_int_m, shift_out); + } + + ret = *(reinterpret_cast(&ret_v)); + return ret; +} +/// @ingroup fp16_t math conversion static method +/// @param [in] fp_val uint16_t value of fp16_t object +/// @brief Convert fp16_t to uint8_t +/// @return Return uint8_t value of fp_val which is the value of fp16_t object +static uint8_t Fp16ToUInt8(const uint16_t &fp_val) { + uint8_t m_ret = 0; + // 1.get s_ret and shift it to bit0. 
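+ // Worked example for the extraction below, assuming the standard IEEE binary16 layout
+ // (1 sign bit, 5 exponent bits, 10 mantissa bits) behind the FP16_EXTRAC_* macros:
+ //   fp_val = 0xC500 (-5.0): s_ret = 1, hf_e = 17, hf_m = 0x500 (0x100 | hidden bit 0x400),
+ //   value = (-1)^1 * 2^(17 - 15) * (0x500 / 0x400) = -4 * 1.25 = -5.0, so the result is -5.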
+ uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val); + // 2.get hf_e and hf_m + uint16_t hf_e = FP16_EXTRAC_EXP(fp_val); + uint16_t hf_m = FP16_EXTRAC_MAN(fp_val); + + if (FP16_IS_DENORM(fp_val)) { // Denormalized number + return 0; + } + + if (FP16_IS_INVALID(fp_val)) { // Inf or NaN + m_ret = ~0; + } else { + uint64_t long_int_m = hf_m; + uint8_t overflow_flag = 0; + uint16_t shift_out = 0; + while (hf_e != kFp16ExpBias) { + if (hf_e > kFp16ExpBias) { + hf_e--; + long_int_m = long_int_m << 1; + if (long_int_m >= 0x40000Lu) { // overflow 0100 0000 0000 0000 0000 + long_int_m = 0x3FFFFLu; // 11 1111 1111 1111 1111 10(fp16_t-man)+8(uint8)=18bit + overflow_flag = 1; + m_ret = ~0; + break; + } + } else { + hf_e++; + shift_out++; + } + } + if (!overflow_flag) { + bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen); + m_ret = static_cast((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max); + if (need_round && m_ret != kBitLen8Max) { + m_ret++; + } + } + } + + if (s_ret == 1) { // Negative number + m_ret = 0; + } + // m_ret equal to final result + return m_ret; +} +/// @ingroup fp16_t static method +/// @param [in] s_ret sign of fp16_t value +/// @param [in] long_int_m man uint64_t value of fp16_t object +/// @param [in] shift_out shift offset +/// @brief calculate uint16 value by sign,man and shift offset +/// @return Return uint16 value of fp16_t object +static uint16_t GetUint16ValByMan(uint16_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) { + bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen); + auto m_ret = static_cast((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max); + if (need_round && m_ret < kInt16Max) { + m_ret++; + } + if (s_ret) { + m_ret = (~m_ret) + 1; + } + if (m_ret == 0) { + s_ret = 0; + } + return static_cast((s_ret << kBitShift15) | (m_ret)); +} +/// @ingroup fp16_t math conversion static method +/// @param [in] fp_val uint16_t value of fp16_t object +/// @brief Convert fp16_t to int16_t +/// @return Return int16_t value of fp_val which is the value of fp16_t object +static int16_t Fp16ToInt16(const uint16_t &fp_val) { + int16_t ret; + uint16_t ret_v; + // 1.get s_ret and shift it to bit0. 
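+ // Saturation sketch for the int16 path below: denormals flush to 0; Inf/NaN and
+ // out-of-range magnitudes set overflow_flag, so ret_v becomes kInt16Max + s_ret,
+ // i.e. 0x7FFF (INT16_MAX) for positive inputs and 0x8000 (INT16_MIN) for negative ones.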
+ uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val); + // 2.get hf_e and hf_m + uint16_t hf_e = FP16_EXTRAC_EXP(fp_val); + uint16_t hf_m = FP16_EXTRAC_MAN(fp_val); + + if (FP16_IS_DENORM(fp_val)) { // Denormalized number + ret_v = 0; + ret = *(reinterpret_cast(&ret_v)); + return ret; + } + + uint64_t long_int_m = hf_m; + uint8_t overflow_flag = 0; + uint16_t shift_out = 0; + if (FP16_IS_INVALID(fp_val)) { // Inf or NaN + overflow_flag = 1; + } else { + while (hf_e != kFp16ExpBias) { + if (hf_e > kFp16ExpBias) { + hf_e--; + long_int_m = long_int_m << 1; + if (s_ret == 1 && long_int_m > 0x2000000Lu) { // sign=1,negative number(<0) + long_int_m = 0x2000000Lu; // 10(fp16_t-man)+15(int16)=25bit + overflow_flag = 1; + break; + } else if (s_ret != 1 && long_int_m >= 0x1FFFFFFLu) { // sign=0,positive number(>0) Overflow + long_int_m = 0x1FFFFFFLu; // 10(fp16_t-man)+15(int16)=25bit + overflow_flag = 1; + break; + } + } else { + hf_e++; + shift_out++; + } + } + } + if (overflow_flag) { + ret_v = kInt16Max + s_ret; + } else { + // Generate final result + ret_v = GetUint16ValByMan(s_ret, long_int_m, shift_out); + } + ret = *(reinterpret_cast(&ret_v)); + return ret; +} +/// @ingroup fp16_t math conversion static method +/// @param [in] fp_val uint16_t value of fp16_t object +/// @brief Convert fp16_t to uint16_t +/// @return Return uint16_t value of fp_val which is the value of fp16_t object +static uint16_t Fp16ToUInt16(const uint16_t &fp_val) { + uint16_t m_ret = 0; + // 1.get s_ret and shift it to bit0. + uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val); + // 2.get hf_e and hf_m + uint16_t hf_e = FP16_EXTRAC_EXP(fp_val); + uint16_t hf_m = FP16_EXTRAC_MAN(fp_val); + + if (FP16_IS_DENORM(fp_val)) { // Denormalized number + return 0; + } + + if (FP16_IS_INVALID(fp_val)) { // Inf or NaN + m_ret = ~0; + } else { + uint64_t long_int_m = hf_m; + uint16_t shift_out = 0; + while (hf_e != kFp16ExpBias) { + if (hf_e > kFp16ExpBias) { + hf_e--; + long_int_m = long_int_m << 1; + } else { + hf_e++; + shift_out++; + } + } + bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen); + m_ret = static_cast((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max); + if (need_round && m_ret != kBitLen16Max) { + m_ret++; + } + } + + if (s_ret == 1) { // Negative number + m_ret = 0; + } + // m_ret equal to final result + return m_ret; +} +/// @ingroup fp16_t math convertion static method +/// @param [in] fp_val uint16_t value of fp16_t object +/// @brief Convert fp16_t to int32_t +/// @return Return int32_t value of fp_val which is the value of fp16_t object +static int32_t Fp16ToInt32(const uint16_t &fp_val) { + uint32_t ret_v; + // 1.get s_ret and shift it to bit0. 
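+ // Range note: every finite fp16 value (at most 65504 in magnitude) fits in int32, so
+ // unlike the int8/int16 paths only Inf/NaN reach the kInt32Max + s_ret clamp below,
+ // which yields INT32_MAX when the sign bit is clear and INT32_MIN when it is set.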
+ uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val); + // 2.get hf_e and hf_m + uint16_t hf_e = FP16_EXTRAC_EXP(fp_val); + uint16_t hf_m = FP16_EXTRAC_MAN(fp_val); + + if (FP16_IS_INVALID(fp_val)) { // Inf or NaN + ret_v = kInt32Max + s_ret; + } else { + uint64_t long_int_m = hf_m; + uint16_t shift_out = 0; + + while (hf_e != kFp16ExpBias) { + if (hf_e > kFp16ExpBias) { + hf_e--; + long_int_m = long_int_m << 1; + } else { + hf_e++; + shift_out++; + } + } + bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen); + auto m_ret = static_cast((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max); + if (need_round && m_ret < kInt32Max) { + m_ret++; + } + + if (s_ret == 1) { + m_ret = (~m_ret) + 1; + } + if (m_ret == 0) { + s_ret = 0; + } + // Generate final result + ret_v = (s_ret << kBitShift31) | (m_ret); + } + + return *(reinterpret_cast(&ret_v)); +} +/// @ingroup fp16_t math conversion static method +/// @param [in] fp_val uint16_t value of fp16_t object +/// @brief Convert fp16_t to uint32_t +/// @return Return uint32_t value of fp_val which is the value of fp16_t object +static uint32_t Fp16ToUInt32(const uint16_t &fp_val) { + uint32_t m_ret; + // 1.get s_ret and shift it to bit0. + uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val); + // 2.get hf_e and hf_m + uint16_t hf_e = FP16_EXTRAC_EXP(fp_val); + uint16_t hf_m = FP16_EXTRAC_MAN(fp_val); + + if (FP16_IS_DENORM(fp_val)) { // Denormalized number + return 0u; + } + + if (FP16_IS_INVALID(fp_val)) { // Inf or NaN + m_ret = ~0u; + } else { + uint64_t long_int_m = hf_m; + uint16_t shift_out = 0; + while (hf_e != kFp16ExpBias) { + if (hf_e > kFp16ExpBias) { + hf_e--; + long_int_m = long_int_m << 1; + } else { + hf_e++; + shift_out++; + } + } + bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen); + m_ret = static_cast(long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max; + if (need_round && m_ret != kBitLen32Max) { + m_ret++; + } + } + + if (s_ret == 1) { // Negative number + m_ret = 0; + } + // m_ret equal to final result + return m_ret; +} +static uint16_t Fp16AddCalVal(uint16_t &s_ret, int16_t e_ret, uint16_t m_ret, uint32_t m_trunc, uint16_t shift_out) { + uint16_t m_min = kFp16ManHideBit << shift_out; + uint16_t m_max = m_min << 1; + // Denormal + while (m_ret < m_min && e_ret > 0) { // the value of m_ret should not be smaller than 2^23 + m_ret = m_ret << 1; + m_ret += (kFp32SignMask & m_trunc) >> kFp32SignIndex; + m_trunc = m_trunc << 1; + e_ret = e_ret - 1; + } + while (m_ret >= m_max) { // the value of m_ret should be smaller than 2^24 + m_trunc = m_trunc >> 1; + m_trunc = m_trunc | (kFp32SignMask * (m_ret & 1)); + m_ret = m_ret >> 1; + e_ret = e_ret + 1; + } + + bool b_last_bit = ((m_ret & 1) > 0); + bool b_trunc_high = 0; + bool b_trunc_left = 0; + b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0); + b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0); + m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret, shift_out); + while (m_ret >= m_max) { + m_ret = m_ret >> 1; + e_ret = e_ret + 1; + } + + if (e_ret == 0 && m_ret <= m_max) { + m_ret = m_ret >> 1; + } + Fp16Normalize(e_ret, m_ret); + uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast(e_ret), m_ret); + return ret; +} +/// @ingroup fp16_t math operator +/// @param [in] v_1 left operator value of fp16_t object +/// @param [in] v_2 right operator value of fp16_t object +/// @brief Performing fp16_t addition +/// @return Return fp16_t result of adding this and fp +static uint16_t 
Fp16Add(uint16_t v_1, uint16_t v_2) { + uint16_t s_a; + uint16_t s_b; + int16_t e_a; + int16_t e_b; + uint32_t m_a; + uint32_t m_b; + uint16_t m_a_tmp; + uint16_t m_b_tmp; + uint16_t shift_out = 0; + // 1.Extract + ExtractFp16(v_1, s_a, e_a, m_a_tmp); + ExtractFp16(v_2, s_b, e_b, m_b_tmp); + m_a = m_a_tmp; + m_b = m_b_tmp; + + uint16_t sum; + uint16_t s_ret; + if (s_a != s_b) { + ReverseMan(s_a > 0, m_a); + ReverseMan(s_b > 0, m_b); + sum = static_cast(GetManSum(e_a, m_a, e_b, m_b)); + s_ret = (sum & kFp16SignMask) >> kFp16SignIndex; + ReverseMan(s_ret > 0, m_a); + ReverseMan(s_ret > 0, m_b); + } else { + sum = static_cast(GetManSum(e_a, m_a, e_b, m_b)); + s_ret = s_a; + } + + if (sum == 0) { + shift_out = 3; // shift to left 3 bits + m_a = m_a << shift_out; + m_b = m_b << shift_out; + } + + uint32_t m_trunc = 0; + int16_t e_ret = std::max(e_a, e_b); + int16_t e_tmp = std::abs(e_a - e_b); + if (e_a > e_b) { + m_trunc = (m_b << (kBitShift32 - static_cast(e_tmp))); + m_b = RightShift(m_b, e_tmp); + } else if (e_a < e_b) { + m_trunc = (m_a << (kBitShift32 - static_cast(e_tmp))); + m_a = RightShift(m_a, e_tmp); + } + // calculate mantissav + auto m_ret = static_cast(m_a + m_b); + return Fp16AddCalVal(s_ret, e_ret, m_ret, m_trunc, shift_out); +} +/// @ingroup fp16_t math operator +/// @param [in] v_1 left operator value of fp16_t object +/// @param [in] v_2 right operator value of fp16_t object +/// @brief Performing fp16_t subtraction +/// @return Return fp16_t result of subtraction fp from this +static uint16_t Fp16Sub(uint16_t v_1, uint16_t v_2) { + // Reverse + uint16_t tmp = ((~(v_2)) & kFp16SignMask) | (v_2 & kFp16AbsMax); + return Fp16Add(v_1, tmp); +} +/// @ingroup fp16_t math operator +/// @param [in] v_1 left operator value of fp16_t object +/// @param [in] v_2 right operator value of fp16_t object +/// @brief Performing fp16_t multiplication +/// @return Return fp16_t result of multiplying this and fp +static uint16_t Fp16Mul(uint16_t v_1, uint16_t v_2) { + uint16_t s_a, s_b; + int16_t e_a, e_b; + uint32_t m_a, m_b; + uint16_t s_ret, m_ret; + int16_t e_ret; + uint32_t mul_m; + uint16_t m_a_tmp, m_b_tmp; + // 1.Extract + ExtractFp16(v_1, s_a, e_a, m_a_tmp); + ExtractFp16(v_2, s_b, e_b, m_b_tmp); + m_a = m_a_tmp; + m_b = m_b_tmp; + + e_ret = e_a + e_b - kFp16ExpBias - kDim10; + mul_m = m_a * m_b; + s_ret = s_a ^ s_b; + + uint32_t m_min = kFp16ManHideBit; + uint32_t m_max = m_min << 1; + uint32_t m_trunc = 0; + // the value of m_ret should not be smaller than 2^23 + while (mul_m < m_min && e_ret > 1) { + mul_m = mul_m << 1; + e_ret = e_ret - 1; + } + while (mul_m >= m_max || e_ret < 1) { + m_trunc = m_trunc >> 1; + m_trunc = m_trunc | (kFp32SignMask * (mul_m & 1)); + mul_m = mul_m >> 1; + e_ret = e_ret + 1; + } + bool b_last_bit = ((mul_m & 1) > 0); + bool b_trunc_high = 0; + bool b_trunc_left = 0; + b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0); + b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0); + mul_m = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, mul_m); + + while (mul_m >= m_max || e_ret < 0) { + mul_m = mul_m >> 1; + e_ret = e_ret + 1; + } + + if (e_ret == 1 && mul_m < kFp16ManHideBit) { + e_ret = 0; + } + m_ret = static_cast(mul_m); + + Fp16Normalize(e_ret, m_ret); + + uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast(e_ret), m_ret); + return ret; +} +/// @ingroup fp16_t math operator divided +/// @param [in] v_1 left operator value of fp16_t object +/// @param [in] v_2 right operator value of 
fp16_t object +/// @brief Performing fp16_t division +/// @return Return fp16_t result of division this by fp +static uint16_t Fp16Div(uint16_t v_1, uint16_t v_2) { + uint16_t ret; + if (FP16_IS_ZERO(v_2)) { // result is inf + // throw "fp16_t division by zero."; + uint16_t s_a, s_b; + uint16_t s_ret; + s_a = FP16_EXTRAC_SIGN(v_1); + s_b = FP16_EXTRAC_SIGN(v_2); + s_ret = s_a ^ s_b; + ret = FP16_CONSTRUCTOR(s_ret, kFp16MaxExp, 0u); + } else if (FP16_IS_ZERO(v_1)) { + ret = 0u; + } else { + uint16_t s_a, s_b; + int16_t e_a, e_b; + uint64_t m_a, m_b; + float m_div; + uint16_t m_a_tmp, m_b_tmp; + // 1.Extract + ExtractFp16(v_1, s_a, e_a, m_a_tmp); + ExtractFp16(v_2, s_b, e_b, m_b_tmp); + m_a = m_a_tmp; + m_b = m_b_tmp; + + uint64_t m_tmp; + if (e_a > e_b) { + m_tmp = m_a; + uint16_t tmp; + tmp = e_a - e_b; + for (int i = 0; i < tmp; i++) { + m_tmp = m_tmp << 1; + } + m_a = m_tmp; + } else if (e_a < e_b) { + m_tmp = m_b; + uint16_t tmp = e_b - e_a; + for (int i = 0; i < tmp; i++) { + m_tmp = m_tmp << 1; + } + m_b = m_tmp; + } + m_div = static_cast(m_a * 1.0f / m_b); + fp16_t fp_div; + fp_div = m_div; + ret = fp_div.val; + if (s_a != s_b) { + ret |= kFp16SignMask; + } + } + return ret; +} + +// operate +fp16_t fp16_t::operator+(const fp16_t fp) { + uint16_t ret_val = Fp16Add(val, fp.val); + fp16_t ret(ret_val); + return ret; +} +fp16_t fp16_t::operator-(const fp16_t fp) { + uint16_t ret_val = Fp16Sub(val, fp.val); + fp16_t ret(ret_val); + return ret; +} +fp16_t fp16_t::operator*(const fp16_t fp) { + uint16_t ret_val = Fp16Mul(val, fp.val); + fp16_t ret(ret_val); + return ret; +} +fp16_t fp16_t::operator/(const fp16_t fp) { + uint16_t ret_val = Fp16Div(val, fp.val); + fp16_t ret(ret_val); + return ret; +} + +fp16_t fp16_t::operator+=(const fp16_t fp) { + val = Fp16Add(val, fp.val); + return *this; +} +fp16_t fp16_t::operator-=(const fp16_t fp) { + val = Fp16Sub(val, fp.val); + return *this; +} +fp16_t fp16_t::operator*=(const fp16_t fp) { + val = Fp16Mul(val, fp.val); + return *this; +} +fp16_t fp16_t::operator/=(const fp16_t fp) { + val = Fp16Div(val, fp.val); + return *this; +} + +// compare +bool fp16_t::operator==(const fp16_t &fp) const { + bool result = true; + if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) { + result = true; + } else { + result = ((val & kBitLen16Max) == (fp.val & kBitLen16Max)); // bit compare + } + return result; +} +bool fp16_t::operator!=(const fp16_t &fp) const { + bool result = true; + if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) { + result = false; + } else { + result = ((val & kBitLen16Max) != (fp.val & kBitLen16Max)); // bit compare + } + return result; +} +bool fp16_t::operator>(const fp16_t &fp) const { + uint16_t s_a, s_b; + uint16_t e_a, e_b; + uint16_t m_a, m_b; + bool result = true; + + // 1.Extract + s_a = FP16_EXTRAC_SIGN(val); + s_b = FP16_EXTRAC_SIGN(fp.val); + e_a = FP16_EXTRAC_EXP(val); + e_b = FP16_EXTRAC_EXP(fp.val); + m_a = FP16_EXTRAC_MAN(val); + m_b = FP16_EXTRAC_MAN(fp.val); + + // Compare + if ((s_a == 0) && (s_b > 0)) { // + - + // -0=0 + result = !(FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)); + } else if ((s_a == 0) && (s_b == 0)) { // + + + if (e_a > e_b) { // e_a - e_b >= 1; Va always larger than Vb + result = true; + } else if (e_a == e_b) { + result = m_a > m_b; + } else { + result = false; + } + } else if ((s_a > 0) && (s_b > 0)) { // - - opposite to + + + if (e_a < e_b) { + result = true; + } else if (e_a == e_b) { + result = m_a < m_b; + } else { + result = false; + } + } else { // - + + result = false; + } + + return result; +} +bool 
fp16_t::operator>=(const fp16_t &fp) const { + bool result = true; + if ((*this) > fp) { + result = true; + } else if ((*this) == fp) { + result = true; + } else { + result = false; + } + + return result; +} +bool fp16_t::operator<(const fp16_t &fp) const { + bool result = true; + if ((*this) >= fp) { + result = false; + } else { + result = true; + } + + return result; +} +bool fp16_t::operator<=(const fp16_t &fp) const { + bool result = true; + if ((*this) > fp) { + result = false; + } else { + result = true; + } + + return result; +} + +// evaluation fp16_t &fp16_t::operator=(const fp16_t &fp) { if (&fp == this) { return *this; @@ -100,31 +822,31 @@ fp16_t &fp16_t::operator=(const fp16_t &fp) { val = fp.val; return *this; } - fp16_t &fp16_t::operator=(const float &f_val) { uint16_t s_ret, m_ret; int16_t e_ret; uint32_t e_f, m_f; - uint32_t ui32_v = *(reinterpret_cast(&f_val)); // 1:8:23bit sign:exp:man + const uint32_t ui32_v = *(reinterpret_cast(&f_val)); // 1:8:23bit sign:exp:man uint32_t m_len_delta; - s_ret = static_cast((ui32_v & FP32_SIGN_MASK) >> FP32_SIGN_INDEX); // 4Byte->2Byte - e_f = (ui32_v & FP32_EXP_MASK) >> FP32_MAN_LEN; // 8 bit exponent - m_f = (ui32_v & FP32_MAN_MASK); // 23 bit mantissa dont't need to care about denormal - m_len_delta = FP32_MAN_LEN - FP16_MAN_LEN; + s_ret = static_cast((ui32_v & kFp32SignMask) >> kFp32SignIndex); // 4Byte->2Byte + e_f = (ui32_v & kFp32ExpMask) >> kFp32ManLen; // 8 bit exponent + m_f = (ui32_v & kFp32ManMask); // 23 bit mantissa dont't need to care about denormal + m_len_delta = kFp32ManLen - kFp16ManLen; + bool need_round = false; // Exponent overflow/NaN converts to signed inf/NaN if (e_f > 0x8Fu) { // 0x8Fu:142=127+15 - e_ret = FP16_MAX_EXP - 1; - m_ret = FP16_MAX_MAN; + e_ret = kFp16MaxExp - 1; + m_ret = kFp16MaxMan; } else if (e_f <= 0x70u) { // 0x70u:112=127-15 Exponent underflow converts to denormalized half or signed zero e_ret = 0; if (e_f >= 0x67) { // 0x67:103=127-24 Denormal - m_f = (m_f | FP32_MAN_HIDE_BIT); - uint16_t shift_out = FP32_MAN_LEN; + m_f = (m_f | kFp32ManHideBit); + uint16_t shift_out = kFp32ManLen; uint64_t m_tmp = (static_cast(m_f)) << (e_f - 0x67); - bool need_round = IsRoundOne(m_tmp, shift_out); + need_round = IsRoundOne(m_tmp, shift_out); m_ret = static_cast(m_tmp >> shift_out); if (need_round) { m_ret++; @@ -137,12 +859,12 @@ fp16_t &fp16_t::operator=(const float &f_val) { } else { // Regular case with no overflow or underflow e_ret = static_cast(e_f - 0x70u); - bool need_round = IsRoundOne(m_f, static_cast(m_len_delta)); + need_round = IsRoundOne(m_f, static_cast(m_len_delta)); m_ret = static_cast(m_f >> m_len_delta); if (need_round) { m_ret++; } - if (m_ret & FP16_MAN_HIDE_BIT) { + if (m_ret & kFp16ManHideBit) { e_ret++; } } @@ -151,159 +873,344 @@ fp16_t &fp16_t::operator=(const float &f_val) { val = FP16_CONSTRUCTOR(s_ret, static_cast(e_ret), m_ret); return *this; } +fp16_t &fp16_t::operator=(const int8_t &i_val) { + uint16_t s_ret, e_ret, m_ret; + + s_ret = static_cast(((static_cast(i_val)) & 0x80) >> kDim7); + m_ret = static_cast(((static_cast(i_val)) & kInt8Max)); + + if (m_ret == 0) { + e_ret = 0; + } else { + if (s_ret) { // negative number(<0) + m_ret = static_cast(std::abs(i_val)); // complement + } + + e_ret = kFp16ManLen; + while ((m_ret & kFp16ManHideBit) == 0) { + m_ret = m_ret << 1; + e_ret = e_ret - 1; + } + e_ret = e_ret + kFp16ExpBias; + } + + val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret); + return *this; +} +fp16_t &fp16_t::operator=(const uint8_t &ui_val) { + uint16_t s_ret, e_ret, 
m_ret; + s_ret = 0; + e_ret = 0; + m_ret = ui_val; + if (m_ret) { + e_ret = kFp16ManLen; + while ((m_ret & kFp16ManHideBit) == 0) { + m_ret = m_ret << 1; + e_ret = e_ret - 1; + } + e_ret = e_ret + kFp16ExpBias; + } + + val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret); + return *this; +} +static void SetValByUint16Val(const uint16_t &input_val, const uint16_t &sign, uint16_t &ret_val) { + uint32_t m_tmp = (input_val & kFp32AbsMax); + uint16_t m_min = kFp16ManHideBit; + uint16_t m_max = m_min << 1; + uint16_t len = static_cast(GetManBitLength(m_tmp)); + if (m_tmp) { + int16_t e_ret; + if (len > kDim11) { + e_ret = kFp16ExpBias + kFp16ManLen; + uint16_t e_tmp = len - kDim11; + uint32_t trunc_mask = 1; + for (int i = 1; i < e_tmp; i++) { + trunc_mask = (trunc_mask << 1) + 1; + } + uint32_t m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp); + for (int i = 0; i < e_tmp; i++) { + m_tmp = (m_tmp >> 1); + e_ret = e_ret + 1; + } + bool b_last_bit = ((m_tmp & 1) > 0); + bool b_trunc_high = 0; + bool b_trunc_left = 0; + if (kRoundToNearest == g_round_mode) { // trunc + b_trunc_high = ((m_trunc & kFp32SignMask) > 0); + b_trunc_left = ((m_trunc & kFp32AbsMax) > 0); + } + m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp); + while (m_tmp >= m_max || e_ret < 0) { + m_tmp = m_tmp >> 1; + e_ret = e_ret + 1; + } + } else { + e_ret = kFp16ExpBias; + m_tmp = m_tmp << (kManBitLength - len); + e_ret = e_ret + (len - 1); + } + auto m_ret = static_cast(m_tmp); + ret_val = FP16_CONSTRUCTOR(sign, static_cast(e_ret), m_ret); + } +} +fp16_t &fp16_t::operator=(const int16_t &i_val) { + if (i_val == 0) { + val = 0; + } else { + uint16_t ui_val = *(reinterpret_cast(&i_val)); + auto s_ret = static_cast(ui_val >> kBitShift15); + if (s_ret) { + int16_t iValM = -i_val; + ui_val = *(reinterpret_cast(&iValM)); + } + SetValByUint16Val(ui_val, s_ret, val); + } + return *this; +} +fp16_t &fp16_t::operator=(const uint16_t &ui_val) { + if (ui_val == 0) { + val = 0; + } else { + int16_t e_ret; + uint16_t m_ret = ui_val; + uint16_t m_min = kFp16ManHideBit; + uint16_t m_max = m_min << 1; + uint16_t len = static_cast(GetManBitLength(m_ret)); + if (len > kManBitLength) { + e_ret = kFp16ExpBias + kFp16ManLen; + uint32_t m_trunc; + uint32_t trunc_mask = 1; + uint16_t e_tmp = len - kManBitLength; + for (int i = 1; i < e_tmp; i++) { + trunc_mask = (trunc_mask << 1) + 1; + } + m_trunc = (m_ret & trunc_mask) << (kBitShift32 - e_tmp); + for (int i = 0; i < e_tmp; i++) { + m_ret = (m_ret >> 1); + e_ret = e_ret + 1; + } + bool b_last_bit = ((m_ret & 1) > 0); + bool b_trunc_high = 0; + bool b_trunc_left = 0; + if (kRoundToNearest == g_round_mode) { // trunc + b_trunc_high = ((m_trunc & kFp32SignMask) > 0); + b_trunc_left = ((m_trunc & kFp32AbsMax) > 0); + } + m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret); + while (m_ret >= m_max || e_ret < 0) { + m_ret = m_ret >> 1; + e_ret = e_ret + 1; + } + if (FP16_IS_INVALID(val)) { + val = kFp16Max; + } + } else { + e_ret = kFp16ExpBias; + m_ret = m_ret << (kDim11 - len); + e_ret = e_ret + (len - 1); + } + val = FP16_CONSTRUCTOR(0u, static_cast(e_ret), m_ret); + } + return *this; +} +static void SetValByUint32Val(const uint32_t &input_val, const uint16_t &sign, uint16_t &ret_val) { + int16_t e_ret; + uint32_t m_tmp = (input_val & kFp32AbsMax); + uint32_t m_min = kFp16ManHideBit; + uint32_t m_max = m_min << 1; + uint16_t len = static_cast(GetManBitLength(m_tmp)); + if (len > kDim11) { + e_ret = kFp16ExpBias + kFp16ManLen; + uint32_t m_trunc = 0; + uint32_t 
trunc_mask = 1; + uint16_t e_tmp = len - kDim11; + for (int i = 1; i < e_tmp; i++) { + trunc_mask = (trunc_mask << 1) + 1; + } + m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp); + for (int i = 0; i < e_tmp; i++) { + m_tmp = (m_tmp >> 1); + e_ret = e_ret + 1; + } + bool b_last_bit = ((m_tmp & 1) > 0); + bool b_trunc_high = 0; + bool b_trunc_left = 0; + if (kRoundToNearest == g_round_mode) { // trunc + b_trunc_high = ((m_trunc & kFp32SignMask) > 0); + b_trunc_left = ((m_trunc & kFp32AbsMax) > 0); + } + m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp); + while (m_tmp >= m_max || e_ret < 0) { + m_tmp = m_tmp >> 1; + e_ret = e_ret + 1; + } + if (e_ret >= kFp16MaxExp) { + e_ret = kFp16MaxExp - 1; + m_tmp = kFp16MaxMan; + } + } else { + e_ret = kFp16ExpBias; + m_tmp = m_tmp << (kDim11 - len); + e_ret = e_ret + (len - 1); + } + auto m_ret = static_cast(m_tmp); + ret_val = FP16_CONSTRUCTOR(sign, static_cast(e_ret), m_ret); +} fp16_t &fp16_t::operator=(const int32_t &i_val) { if (i_val == 0) { val = 0; } else { uint32_t ui_val = *(reinterpret_cast(&i_val)); - uint16_t s_ret = static_cast(ui_val >> kInt32SymbolShift); + auto s_ret = static_cast(ui_val >> kBitShift31); if (s_ret) { - int32_t i_val_m = -i_val; - ui_val = *(reinterpret_cast(&i_val_m)); + int32_t iValM = -i_val; + ui_val = *(reinterpret_cast(&iValM)); } + SetValByUint32Val(ui_val, s_ret, val); + } + return *this; +} +fp16_t &fp16_t::operator=(const uint32_t &ui_val) { + if (ui_val == 0) { + val = 0; + } else { int16_t e_ret; - uint32_t m_tmp = (ui_val & FP32_ABS_MAX); - uint32_t m_min = FP16_MAN_HIDE_BIT; + uint32_t m_tmp = ui_val; + uint32_t m_min = kFp16ManHideBit; uint32_t m_max = m_min << 1; - int32_t len = static_cast(GetManBitLength(m_tmp)); - if (len > kDim_11) { - e_ret = FP16_EXP_BIAS + FP16_MAN_LEN; + uint16_t len = static_cast(GetManBitLength(m_tmp)); + if (len > kDim11) { + e_ret = kFp16ExpBias + kFp16ManLen; uint32_t m_trunc = 0; uint32_t trunc_mask = 1; - int32_t e_tmp = len - kDim_11; + uint16_t e_tmp = len - kDim11; for (int i = 1; i < e_tmp; i++) { trunc_mask = (trunc_mask << 1) + 1; } - m_trunc = (m_tmp & trunc_mask) << static_cast(kBitShift_32 - e_tmp); - for (int i = 0; i < e_tmp; i++) { + m_trunc = (m_tmp & trunc_mask) << static_cast(kBitShift32 - e_tmp); + for (uint16_t i = 0; i < e_tmp; i++) { m_tmp = (m_tmp >> 1); e_ret = e_ret + 1; } bool b_last_bit = ((m_tmp & 1) > 0); bool b_trunc_high = false; bool b_trunc_left = false; - if (g_round_mode == ROUND_TO_NEAREST) { // trunc - b_trunc_high = ((m_trunc & FP32_SIGN_MASK) > 0); - b_trunc_left = ((m_trunc & FP32_ABS_MAX) > 0); + if (g_round_mode == kRoundToNearest) { // trunc + b_trunc_high = ((m_trunc & kFp32SignMask) > 0); + b_trunc_left = ((m_trunc & kFp32AbsMax) > 0); } m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp); while (m_tmp >= m_max || e_ret < 0) { m_tmp = m_tmp >> 1; e_ret = e_ret + 1; } - if (e_ret >= FP16_MAX_EXP) { - e_ret = FP16_MAX_EXP - 1; - m_tmp = FP16_MAX_MAN; + if (e_ret >= kFp16MaxExp) { + e_ret = kFp16MaxExp - 1; + m_tmp = kFp16MaxMan; } } else { - e_ret = FP16_EXP_BIAS; - m_tmp = m_tmp << static_cast(kDim_11 - len); + e_ret = kFp16ExpBias; + m_tmp = m_tmp << (kDim11 - len); e_ret = e_ret + (len - 1); } - uint16_t m_ret = static_cast(m_tmp); - val = FP16_CONSTRUCTOR(s_ret, static_cast(e_ret), m_ret); + auto m_ret = static_cast(m_tmp); + val = FP16_CONSTRUCTOR(0u, static_cast(e_ret), m_ret); } return *this; } +fp16_t &fp16_t::operator=(const double &d_val) { + uint16_t s_ret; + uint16_t 
m_ret; + int16_t e_ret; + uint64_t e_d; + uint64_t m_d; + uint64_t ui64_v = *(reinterpret_cast(&d_val)); // 1:11:52bit sign:exp:man + uint32_t m_len_delta; -/// -/// @ingroup fp16_t math conversion static method -/// @param [in] fp_val uint16_t value of fp16_t object -/// @brief Convert fp16_t to float/fp32 -/// @return Return float/fp32 value of fp_val which is the value of fp16_t object -/// -float Fp16ToFloat(const uint16_t &fp_val) { - float ret; - - uint16_t hf_sign, hf_man; - int16_t hf_exp; - ExtractFP16(fp_val, &hf_sign, &hf_exp, &hf_man); - - while (hf_man && !(hf_man & FP16_MAN_HIDE_BIT)) { - hf_man <<= 1; - hf_exp--; - } - - uint32_t s_ret, e_ret, m_ret, f_val; + s_ret = static_cast((ui64_v & kFp64SignMask) >> kFp64SignIndex); // 4Byte + e_d = (ui64_v & kFp64ExpMask) >> kFp64ManLen; // 10 bit exponent + m_d = (ui64_v & kFp64ManMask); // 52 bit mantissa + m_len_delta = kFp64ManLen - kFp16ManLen; - s_ret = hf_sign; - if (!hf_man) { + bool need_round = false; + // Exponent overflow/NaN converts to signed inf/NaN + if (e_d >= 0x410u) { // 0x410:1040=1023+16 + e_ret = kFp16MaxExp - 1; + m_ret = kFp16MaxMan; + val = FP16_CONSTRUCTOR(s_ret, static_cast(e_ret), m_ret); + } else if (e_d <= 0x3F0u) { // Exponent underflow converts to denormalized half or signed zero + // 0x3F0:1008=1023-15 + // Signed zeros, denormalized floats, and floats with small + // exponents all convert to signed zero half precision. e_ret = 0; - m_ret = 0; - } else { - e_ret = static_cast(hf_exp - FP16_EXP_BIAS + FP32_EXP_BIAS); - m_ret = hf_man & FP16_MAN_MASK; - m_ret = m_ret << (FP32_MAN_LEN - FP16_MAN_LEN); - } - f_val = FP32_CONSTRUCTOR(s_ret, e_ret, m_ret); - Fp16ToFloatData data; - data.uint_data = f_val; - ret = data.float_data; - - return ret; -} - -/// -/// @ingroup fp16_t math convertion static method -/// @param [in] fp_val uint16_t value of fp16_t object -/// @brief Convert fp16_t to int32_t -/// @return Return int32_t value of fp_val which is the value of fp16_t object -/// -int32_t Fp16ToInt32(const uint16_t &fp_val) { - int32_t ret; - uint32_t ret_v; - uint32_t s_ret; - uint16_t hf_e, hf_m; - - // 1.Get s_ret and shift it to bit0. 
- s_ret = FP16_EXTRAC_SIGN(fp_val); - // 2.Get hf_e and hf_m - hf_e = FP16_EXTRAC_EXP(fp_val); - hf_m = FP16_EXTRAC_MAN(fp_val); + if (e_d >= 0x3E7u) { // 0x3E7u:999=1023-24 Denormal + // Underflows to a denormalized value + m_d = (kFp64ManHideBit | m_d); + uint16_t shift_out = kFp64ManLen; + uint64_t m_tmp = (static_cast(m_d)) << (e_d - 0x3E7u); - if (FP16_IS_INVALID(fp_val)) { // Inf or NaN - ret_v = INT32_T_MAX + s_ret; - } else { - uint64_t long_int_m = hf_m; - uint16_t shift_out = 0; - - while (hf_e != FP16_EXP_BIAS) { - if (hf_e > FP16_EXP_BIAS) { - hf_e--; - long_int_m = long_int_m << 1; - } else { - hf_e++; - shift_out++; + need_round = IsRoundOne(m_tmp, shift_out); + m_ret = static_cast(m_tmp >> shift_out); + if (need_round) { + m_ret++; } + } else if (e_d == 0x3E6u && m_d > 0) { + m_ret = 1; + } else { + m_ret = 0; } - uint32_t m_ret; - bool need_round = IsRoundOne(long_int_m, shift_out + FP16_MAN_LEN); - m_ret = static_cast((long_int_m >> (FP16_MAN_LEN + shift_out)) & BIT_LEN32_MAX); - if (need_round && m_ret < INT32_T_MAX) { - m_ret++; - } + } else { // Regular case with no overflow or underflow + e_ret = static_cast(e_d - 0x3F0u); - if (s_ret == 1) { - m_ret = (~m_ret) + 1; + need_round = IsRoundOne(m_d, m_len_delta); + m_ret = static_cast(m_d >> m_len_delta); + if (need_round) { + m_ret++; } - if (m_ret == 0) { - s_ret = 0; + if (m_ret & kFp16ManHideBit) { + e_ret++; } - // Generate final result - ret_v = (s_ret << kInt32SymbolShift) | (m_ret); } - ret = *(reinterpret_cast(&ret_v)); - return ret; + Fp16Normalize(e_ret, m_ret); + val = FP16_CONSTRUCTOR(s_ret, static_cast(e_ret), m_ret); + return *this; } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY float fp16_t::toFloat() const { return Fp16ToFloat(val); } - -int32_t fp16_t::toInt32() const { return Fp16ToInt32(val); } - -// Convert +// convert fp16_t::operator float() const { return Fp16ToFloat(val); } - +fp16_t::operator double() const { return Fp16ToDouble(val); } +fp16_t::operator int8_t() const { return Fp16ToInt8(val); } +fp16_t::operator uint8_t() const { return Fp16ToUInt8(val); } +fp16_t::operator int16_t() const { return Fp16ToInt16(val); } +fp16_t::operator uint16_t() const { return Fp16ToUInt16(val); } fp16_t::operator int32_t() const { return Fp16ToInt32(val); } +fp16_t::operator uint32_t() const { return Fp16ToUInt32(val); } +// Cannot be used, just in order to solve the compile error +fp16_t::operator int64_t() const { return 0; } +// Cannot be used, just in order to solve the compile error +fp16_t::operator uint64_t() const { return 0; } + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int fp16_t::IsInf() { + if ((val & kFp16AbsMax) == kFp16ExpMask) { + if (val & kFp16SignMask) { + return -1; + } else { + return 1; + } + } else { + return 0; + } +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY float fp16_t::ToFloat() const { return Fp16ToFloat(val); } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY double fp16_t::ToDouble() const { return Fp16ToDouble(val); } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int8_t fp16_t::ToInt8() const { return Fp16ToInt8(val); } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint8_t fp16_t::ToUInt8() const { return Fp16ToUInt8(val); } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int16_t fp16_t::ToInt16() const { return Fp16ToInt16(val); } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint16_t fp16_t::ToUInt16() const { return Fp16ToUInt16(val); } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int32_t fp16_t::ToInt32() const { return 
Fp16ToInt32(val); } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint32_t fp16_t::ToUInt32() const { return Fp16ToUInt32(val); } } // namespace ge diff --git a/src/ge/common/fp16_t.h b/src/ge/common/fp16_t.h index 4e46c041..34908b95 100644 --- a/src/ge/common/fp16_t.h +++ b/src/ge/common/fp16_t.h @@ -22,298 +22,575 @@ #include namespace ge { -/** - *@ingroup fp16 basic parameter - *@brief fp16 exponent bias - */ -#define FP16_EXP_BIAS (15) -/** - *@ingroup fp16 basic parameter - *@brief the mantissa bit length of fp16 is 10 - */ -#define FP16_MAN_LEN (10) -/** - *@ingroup fp16 basic parameter - *@brief bit index of sign in fp16 - */ -#define FP16_SIGN_INDEX (15) -/** - *@ingroup fp16 basic parameter - *@brief exponent mask of fp16 ( 11111 00000 00000) - */ -#define FP16_EXP_MASK (0x7C00) -/** - *@ingroup fp16 basic parameter - *@brief mantissa mask of fp16 ( 11111 11111) - */ -#define FP16_MAN_MASK (0x03FF) -/** - *@ingroup fp16 basic parameter - *@brief clear bit of mantissa of fp16( 1 00000 00000) - */ -#define FP16_MAN_HIDE_BIT (0x0400) -/** - *@ingroup fp16 basic parameter - *@brief maximum value (0111 1011 1111 1111) - */ -#define FP16_MAX (0x7BFF) -/** - *@ingroup fp16 basic parameter - *@brief maximum value (0111 1011 1111 1111) - */ -/** - *@ingroup fp16 basic parameter - *@brief maximum exponent value of fp16 is 15(11111) - */ -#define FP16_MAX_EXP (0x001F) -/** - *@ingroup fp16 basic parameter - *@brief maximum mantissa value of fp16(11111 11111) - */ -#define FP16_MAX_MAN (0x03FF) -/** - *@ingroup fp16 basic operator - *@brief get sign of fp16 - */ +using DimIndex = enum { + kDim0 = 0, + kDim1, + kDim2, + kDim3, + kDim4, + kDim5, + kDim6, + kDim7, + kDim8, + kDim9, + kDim10, + kDim11, + kDim12, + kDim13, + kDim14, + kDim15, + kDim16, +}; + +using BitShift = enum { + kBitShift2 = 2, + kBitShift3 = 3, + kBitShift4 = 4, + kBitShift5 = 5, + kBitShift6 = 6, + kBitShift7 = 7, + kBitShift8 = 8, + kBitShift9 = 9, + kBitShift10 = 10, + kBitShift11 = 11, + kBitShift12 = 12, + kBitShift13 = 13, + kBitShift14 = 14, + kBitShift15 = 15, + kBitShift16 = 16, + kBitShift20 = 20, + kBitShift24 = 24, + kBitShift27 = 27, + kBitShift28 = 28, + kBitShift31 = 31, + kBitShift32 = 32, + kBitShift36 = 36, + kBitShift40 = 40, + kBitShift44 = 44, + kBitShift48 = 48, + kBitShift52 = 52, + kBitShift56 = 56, + kBitShift59 = 59, + kBitShift60 = 60, + kBitShift63 = 63, + kBitShift64 = 64, + kBitShift128 = 128, + kBitShift255 = 255, + kBitShift256 = 256, + kBitShift512 = 512, + kBitShift768 = 768, + kBitShift784 = 784, + kBitShift1020 = 1020, + kBitShift1024 = 1024, + kBitShift3136 = 3136, + kBitShift4096 = 4096, + kBitShift6144 = 6144, + kBitShift10240 = 10240, + kBitShift65536 = 65536 +}; +/// @ingroup fp16 basic parameter +/// @brief fp16 exponent bias +constexpr uint16_t kFp16ExpBias = 15; +/// @ingroup fp16 basic parameter +/// @brief the exponent bit length of fp16 is 5 +constexpr uint16_t kFp16ExpLen = 5; +/// @ingroup fp16 basic parameter +/// @brief the mantissa bit length of fp16 is 10 +constexpr uint16_t kFp16ManLen = 10; +/// @ingroup fp16 basic parameter +/// @brief bit index of sign in fp16 +constexpr uint16_t kFp16SignIndex = 15; +/// @ingroup fp16 basic parameter +/// @brief sign mask of fp16 (1 00000 00000 00000) +constexpr uint16_t kFp16SignMask = 0x8000; +/// @ingroup fp16 basic parameter +/// @brief exponent mask of fp16 ( 11111 00000 00000) +constexpr uint16_t kFp16ExpMask = 0x7C00; +/// @ingroup fp16 basic parameter +/// @brief mantissa mask of fp16 ( 11111 11111) +constexpr uint16_t 
kFp16ManMask = 0x03FF; +/// @ingroup fp16 basic parameter +/// @brief hide bit of mantissa of fp16( 1 00000 00000) +constexpr uint16_t kFp16ManHideBit = 0x0400; +/// @ingroup fp16 basic parameter +/// @brief maximum value (0111 1011 1111 1111) +constexpr uint16_t kFp16Max = 0x7BFF; +/// @ingroup fp16 basic parameter +/// @brief minimum value (1111 1011 1111 1111) +constexpr uint16_t kFp16Min = 0xFBFF; +/// @ingroup fp16 basic parameter +/// @brief absolute maximum value (0111 1111 1111 1111) +constexpr uint16_t kFp16AbsMax = 0x7FFF; +/// @ingroup fp16 basic parameter +/// @brief maximum exponent value of fp16 is 15(11111) +constexpr uint16_t kFp16MaxExp = 0x001F; +/// @ingroup fp16 basic parameter +/// @brief maximum valid exponent value of fp16 is 14(11110) +constexpr uint16_t kFp16MaxValidExp = 0x001E; +/// @ingroup fp16 basic parameter +/// @brief maximum mantissa value of fp16(11111 11111) +constexpr uint16_t kFp16MaxMan = 0x03FF; +/// @ingroup fp16 basic parameter +/// @brief absolute minimum normal value of fp16 +/// (E=1,M=0 D=2^(-14)=0.00006103515625) +constexpr uint16_t kFp16MinNormal = 1.0f / (2 << 14); +/// @ingroup fp16 basic operator +/// @brief get sign of fp16 #define FP16_EXTRAC_SIGN(x) (((x) >> 15) & 1) -/** - *@ingroup fp16 basic operator - *@brief get exponent of fp16 - */ -#define FP16_EXTRAC_EXP(x) (((x) >> 10) & FP16_MAX_EXP) -/** - *@ingroup fp16 basic operator - *@brief get mantissa of fp16 - */ -#define FP16_EXTRAC_MAN(x) ((x & 0x3FF) | (((((x) >> 10) & 0x1F) > 0 ? 1 : 0) * 0x400)) -/** - *@ingroup fp16 basic operator - *@brief constructor of fp16 from sign exponent and mantissa - */ -#define FP16_CONSTRUCTOR(s, e, m) (((s) << FP16_SIGN_INDEX) | ((e) << FP16_MAN_LEN) | ((m)&FP16_MAX_MAN)) -/** - *@ingroup fp16 special value judgment - *@brief whether a fp16 is invalid - */ -#define FP16_IS_INVALID(x) ((x & FP16_EXP_MASK) == FP16_EXP_MASK) +/// @ingroup fp16 basic operator +/// @brief get exponent of fp16 +#define FP16_EXTRAC_EXP(x) (((x) >> 10) & kFp16MaxExp) +/// @ingroup fp16 basic operator +/// @brief get mantissa of fp16 +#define FP16_EXTRAC_MAN(x) ((((x) >> 0) & 0x3FF) | (((((x) >> 10) & 0x1F) > 0 ? 
1 : 0) * 0x400)) +/// @ingroup fp16 basic operator +/// @brief constructor of fp16 from sign exponent and mantissa +#define FP16_CONSTRUCTOR(s, e, m) (((s) << kFp16SignIndex) | ((e) << kFp16ManLen) | ((m)&kFp16MaxMan)) +/// @ingroup fp16 special value judgment +/// @brief whether a fp16 is zero +#define FP16_IS_ZERO(x) (((x)&kFp16AbsMax) == 0) +/// @ingroup fp16 special value judgment +/// @brief whether a fp16 is a denormalized value +#define FP16_IS_DENORM(x) ((((x)&kFp16ExpMask) == 0)) +/// @ingroup fp16 special value judgment +/// @brief whether a fp16 is infinite +#define FP16_IS_INF(x) (((x)&kFp16AbsMax) == kFp16ExpMask) +/// @ingroup fp16 special value judgment +/// @brief whether a fp16 is NaN +#define FP16_IS_NAN(x) (((x & kFp16ExpMask) == kFp16ExpMask) && (x & kFp16ManMask)) +/// @ingroup fp16 special value judgment +/// @brief whether a fp16 is invalid +#define FP16_IS_INVALID(x) ((x & kFp16ExpMask) == kFp16ExpMask) +/// @ingroup fp32 basic parameter +/// @brief fp32 exponent bias +constexpr uint16_t kFp32ExpBias = 127; +/// @ingroup fp32 basic parameter +/// @brief the exponent bit length of float/fp32 is 8 +constexpr uint16_t kFp32ExpLen = 8; +/// @ingroup fp32 basic parameter +/// @brief the mantissa bit length of float/fp32 is 23 +constexpr uint16_t kFp32ManLen = 23; +/// @ingroup fp32 basic parameter +/// @brief bit index of sign in float/fp32 +constexpr uint16_t kFp32SignIndex = 31; +/// @ingroup fp32 basic parameter +/// @brief sign mask of fp32 (1 0000 0000 0000 0000 0000 0000 000) +constexpr uint32_t kFp32SignMask = 0x80000000u; +/// @ingroup fp32 basic parameter +/// @brief exponent mask of fp32 ( 1111 1111 0000 0000 0000 0000 000) +constexpr uint32_t kFp32ExpMask = 0x7F800000u; +/// @ingroup fp32 basic parameter +/// @brief mantissa mask of fp32 ( 1111 1111 1111 1111 111) +constexpr uint32_t kFp32ManMask = 0x007FFFFFu; +/// @ingroup fp32 basic parameter +/// @brief hide bit of mantissa of fp32 ( 1 0000 0000 0000 0000 000) +constexpr uint32_t kFp32ManHideBit = 0x00800000u; +/// @ingroup fp32 basic parameter +/// @brief absolute maximum value (0 1111 1111 1111 1111 1111 1111 111) +constexpr uint32_t kFp32AbsMax = 0x7FFFFFFFu; +/// @ingroup fp32 basic parameter +/// @brief maximum exponent value of fp32 is 255(1111 1111) +constexpr uint32_t kFp32MaxExp = 0xFF; +/// @ingroup fp32 basic parameter +/// @brief maximum mantissa value of fp32 (1111 1111 1111 1111 1111 111) +constexpr uint32_t kFp32MaxMan = 0x7FFFFF; +/// @ingroup fp32 special value judgment +/// @brief whether a fp32 is NaN +#define FP32_IS_NAN(x) (((x & kFp32ExpMask) == kFp32ExpMask) && (x & kFp32ManMask)) +/// @ingroup fp32 special value judgment +/// @brief whether a fp32 is infinite +#define FP32_IS_INF(x) (((x & kFp32ExpMask) == kFp32ExpMask) && (!(x & kFp32ManMask))) +/// @ingroup fp32 special value judgment +/// @brief whether a fp32 is a denormalized value +#define FP32_IS_DENORM(x) ((((x)&kFp32ExpMask) == 0)) +/// @ingroup fp32 basic operator +/// @brief get sign of fp32 +#define FP32_EXTRAC_SIGN(x) (((x) >> kFp32SignIndex) & 1) +/// @ingroup fp32 basic operator +/// @brief get exponent of fp16 +#define FP32_EXTRAC_EXP(x) (((x)&kFp32ExpMask) >> kFp32ManLen) +/// @ingroup fp32 basic operator +/// @brief get mantissa of fp16 +#define FP32_EXTRAC_MAN(x) (((x)&kFp32ManMask) | (((((x) >> kFp32ManLen) & kFp32MaxExp) > 0 ? 
1 : 0) * kFp32ManHideBit)) +/// @ingroup fp32 basic operator +/// @brief constructor of fp32 from sign exponent and mantissa +#define FP32_CONSTRUCTOR(s, e, m) (((s) << kFp32SignIndex) | ((e) << kFp32ManLen) | ((m)&kFp32MaxMan)) +/// @ingroup fp64 basic parameter +/// @brief fp64 exponent bias +constexpr uint16_t kFp64ExpBias = 1023; +/// @ingroup fp64 basic parameter +/// @brief the exponent bit length of double/fp64 is 11 +constexpr uint16_t kFp64ExpLen = 11; +/// @ingroup fp64 basic parameter +/// @brief the mantissa bit length of double/fp64 is 52 +constexpr uint16_t kFp64ManLen = 52; +/// @ingroup fp64 basic parameter +/// @brief bit index of sign in double/fp64 is 63 +constexpr uint16_t kFp64SignIndex = 63; +/// @ingroup fp64 basic parameter +/// @brief sign mask of fp64 (1 000 (total 63bits 0)) +constexpr uint64_t kFp64SignMask = 0x8000000000000000LLu; +/// @ingroup fp64 basic parameter +/// @brief exponent mask of fp64 (0 1 11111 11111 0000?-?-(total 52bits 0)) +constexpr uint64_t kFp64ExpMask = 0x7FF0000000000000LLu; +/// @ingroup fp64 basic parameter +/// @brief mantissa mask of fp64 ( 1111?-?-(total 52bits 1)) +constexpr uint64_t kFp64ManMask = 0x000FFFFFFFFFFFFFLLu; +/// @ingroup fp64 basic parameter +/// @brief hide bit of mantissa of fp64 ( 1 0000?-?-(total 52bits 0)) +constexpr uint64_t kFp64ManHideBit = 0x0010000000000000LLu; +/// @ingroup fp64 basic parameter +/// @brief absolute maximum value (0 111?-?-(total 63bits 1)) +constexpr uint64_t kFp64AbsMax = 0x7FFFFFFFFFFFFFFFLLu; +/// @ingroup fp64 basic parameter +/// @brief maximum exponent value of fp64 is 2047(1 11111 11111) +constexpr uint64_t kFp64MaxExp = 0x07FF; +/// @ingroup fp64 basic parameter +/// @brief maximum mantissa value of fp64 (111?-?-(total 52bits 1)) +constexpr uint64_t kFp64MaxMan = 0xFFFFFFFFFFFLLu; +/// @ingroup fp64 special value judgment +/// @brief whether a fp64 is NaN +#define FP64_IS_NAN(x) (((x & kFp64ExpMask) == kFp64ExpMask) && (x & kFp64ManMask)) +/// @ingroup fp64 special value judgment +/// @brief whether a fp64 is infinite +#define FP64_IS_INF(x) (((x & kFp64ExpMask) == kFp64ExpMask) && (!(x & kFp64ManMask))) +/// @ingroup integer special value judgment +/// @brief maximum positive value of int8_t (0111 1111) +constexpr int8_t kInt8Max = 0x7F; +/// @ingroup integer special value judgment +/// @brief maximum value of a data with 8 bits length (1111 111) +constexpr uint8_t kBitLen8Max = 0xFF; +/// @ingroup integer special value judgment +/// @brief maximum positive value of int16_t (0111 1111 1111 1111) +constexpr int16_t kInt16Max = 0x7FFF; +/// @ingroup integer special value judgment +/// @brief maximum value of a data with 16 bits length (1111 1111 1111 1111) +constexpr uint16_t kBitLen16Max = 0xFFFF; +/// @ingroup integer special value judgment +/// @brief maximum positive value of int32_t (0111 1111 1111 1111 1111 1111 1111 1111) +constexpr int32_t kInt32Max = 0x7FFFFFFFu; +/// @ingroup integer special value judgment +/// @brief maximum value of a data with 32 bits length (1111 1111 1111 1111 1111 1111 1111 1111) +constexpr uint32_t kBitLen32Max = 0xFFFFFFFFu; +/// @ingroup integer special value judgment +/// @brief maximum positive value of int64_t +/// (0111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111) +constexpr int64_t kInt64Max = 0x7FFFFFFFFFFFFFFFu; +/// @ingroup integer special value judgment +/// @brief maximum value of a data with 64 bits length +/// (1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111 1111) +constexpr 
uint64_t kBitLen64Max = 0xFFFFFFFFFFFFFFFFu; -/** - *@ingroup fp32 basic parameter - *@brief fp32 exponent bias - */ -#define FP32_EXP_BIAS (127) -/** - *@ingroup fp32 basic parameter - *@brief the mantissa bit length of float/fp32 is 23 - */ -#define FP32_MAN_LEN (23) -/** - *@ingroup fp32 basic parameter - *@brief bit index of sign in float/fp32 - */ -#define FP32_SIGN_INDEX (31) -/** - *@ingroup fp32 basic parameter - *@brief sign mask of fp32 (1 0000 0000 0000 0000 0000 0000 000) - */ -#define FP32_SIGN_MASK (0x80000000u) -/** - *@ingroup fp32 basic parameter - *@brief exponent mask of fp32 ( 1111 1111 0000 0000 0000 0000 000) - */ -#define FP32_EXP_MASK (0x7F800000u) -/** - *@ingroup fp32 basic parameter - *@brief mantissa mask of fp32 ( 1111 1111 1111 1111 111) - */ -#define FP32_MAN_MASK (0x007FFFFFu) -/** - *@ingroup fp32 basic parameter - *@brief hide bit of mantissa of fp32 ( 1 0000 0000 0000 0000 000) - */ -#define FP32_MAN_HIDE_BIT (0x00800000u) -/** - *@ingroup fp32 basic parameter - *@brief absolute maximum value (0 1111 1111 1111 1111 1111 1111 111) - */ -#define FP32_ABS_MAX (0x7FFFFFFFu) -/** - *@ingroup fp32 basic parameter - *@brief maximum mantissa value of fp32 (1111 1111 1111 1111 1111 111) - */ -#define FP32_MAX_MAN (0x7FFFFF) -/** - *@ingroup fp32 basic operator - *@brief constructor of fp32 from sign exponent and mantissa - */ -#define FP32_CONSTRUCTOR(s, e, m) (((s) << FP32_SIGN_INDEX) | ((e) << FP32_MAN_LEN) | ((m)&FP32_MAX_MAN)) -/** - *@ingroup fp64 basic parameter - *@brief the mantissa bit length of double/fp64 is 52 - */ -#define FP64_MAN_LEN (52) -/** - *@ingroup fp64 basic parameter - *@brief bit index of sign in double/fp64 is 63 - */ -#define FP64_SIGN_INDEX (63) -/** - *@ingroup fp64 basic parameter - *@brief sign mask of fp64 (1 000 (total 63bits 0)) - */ -#define FP64_SIGN_MASK (0x8000000000000000LLu) -/** - *@ingroup fp64 basic parameter - *@brief exponent mask of fp64 (0 1 11111 11111 0000?-?-(total 52bits 0)) - */ -#define FP64_EXP_MASK (0x7FF0000000000000LLu) -/** - *@ingroup fp64 basic parameter - *@brief mantissa mask of fp64 ( 1111?-?-(total 52bits 1)) - */ -#define FP64_MAN_MASK (0x000FFFFFFFFFFFFFLLu) -/** - *@ingroup fp64 basic parameter - *@brief hide bit of mantissa of fp64 ( 1 0000?-?-(total 52bits 0)) - */ -#define FP64_MAN_HIDE_BIT (0x0010000000000000LLu) -/** - *@ingroup integer special value judgment - *@brief maximum positive value of int8_t (0111 1111) - */ -#define INT8_T_MAX (0x7F) -/** - *@ingroup integer special value judgment - *@brief maximum positive value of int32_t (0111 1111 1111 1111 1111 1111 1111 1111) - */ -#define INT32_T_MAX (0x7FFFFFFFu) -/** - *@ingroup integer special value judgment - *@brief maximum value of a data with 32 bits length (1111 1111 1111 1111 1111 1111 1111 1111) - */ -#define BIT_LEN32_MAX (0xFFFFFFFFu) -/** - *@ingroup fp16_t enum - *@brief round mode of last valid digital - */ -typedef enum TagFp16RoundMode { - ROUND_TO_NEAREST = 0, /**< round to nearest even */ - ROUND_BY_TRUNCATED, /**< round by truncated */ - ROUND_MODE_RESERVED, -} fp16RoundMode_t; +/// @ingroup fp16_t enum +/// @brief round mode of last valid digital +enum TagFp16RoundMode { + kRoundToNearest = 0, // < round to nearest even + kRoundByTruncated, // < round by truncated + kRoundModeReserved, +}; -/** - *@ingroup fp16_t - *@brief Half precision float - * bit15: 1 bit SIGN +---+-----+------------+ - * bit14-10: 5 bit EXP | S |EEEEE|MM MMMM MMMM| - * bit0-9: 10bit MAN +---+-----+------------+ - * - */ +/// @ingroup fp16_t +/// 
@brief Half precision float +/// bit15: 1 bit SIGN +---+-----+------------+ +/// bit14-10: 5 bit EXP | S |EEEEE|MM MMMM MMMM| +/// bit0-9: 10bit MAN +---+-----+------------+ using fp16_t = struct TagFp16 { uint16_t val; public: - /** - *@ingroup fp16_t constructor - *@brief Constructor without any param(default constructor) - */ + /// @ingroup fp16_t constructor + /// @brief Constructor without any param (default constructor) TagFp16(void) { val = 0x0u; } - /** - *@ingroup fp16_t constructor - *@brief Constructor with an uint16_t value - */ + /// @ingroup fp16_t constructor + /// @brief Constructor with a uint16_t value TagFp16(const uint16_t &ui_val) : val(ui_val) {} - /** - *@ingroup fp16_t constructor - *@brief Constructor with a fp16_t object(copy constructor) - */ + /// @ingroup fp16_t constructor + /// @brief Constructor with a fp16_t object (copy constructor) TagFp16(const TagFp16 &fp) : val(fp.val) {} - /** - *@ingroup fp16_t copy assign - *@brief copy assign - */ + /// @ingroup fp16_t math operator + /// @param [in] fp fp16_t object to be added + /// @brief Override addition operator to perform fp16_t addition + /// @return Return fp16_t result of adding this and fp + TagFp16 operator+(const TagFp16 fp); + /// @ingroup fp16_t math operator + /// @param [in] fp fp16_t object to be subtracted + /// @brief Override subtraction operator to perform fp16_t subtraction + /// @return Return fp16_t result of subtracting fp from this + TagFp16 operator-(const TagFp16 fp); + /// @ingroup fp16_t math operator + /// @param [in] fp fp16_t object to be multiplied + /// @brief Override multiplication operator to perform fp16_t multiplication + /// @return Return fp16_t result of multiplying this and fp + TagFp16 operator*(const TagFp16 fp); + /// @ingroup fp16_t math operator + /// @param [in] fp fp16_t object to be divided by + /// @brief Override division operator to perform fp16_t division + /// @return Return fp16_t result of dividing this by fp + TagFp16 operator/(const TagFp16 fp); + /// @ingroup fp16_t math operator + /// @param [in] fp fp16_t object to be added + /// @brief Override addition operator to perform fp16_t addition + /// @return Return fp16_t result of adding this and fp + TagFp16 operator+=(const TagFp16 fp); + /// @ingroup fp16_t math operator + /// @param [in] fp fp16_t object to be subtracted + /// @brief Override subtraction operator to perform fp16_t subtraction + /// @return Return fp16_t result of subtracting fp from this + TagFp16 operator-=(const TagFp16 fp); + /// @ingroup fp16_t math operator + /// @param [in] fp fp16_t object to be multiplied + /// @brief Override multiplication operator to perform fp16_t multiplication + /// @return Return fp16_t result of multiplying this and fp + TagFp16 operator*=(const TagFp16 fp); + /// @ingroup fp16_t math operator + /// @param [in] fp fp16_t object to be divided by + /// @brief Override division operator to perform fp16_t division + /// @return Return fp16_t result of dividing this by fp + TagFp16 operator/=(const TagFp16 fp); +
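A short usage sketch of the arithmetic overloads (illustrative; it assumes the operators behave as the definitions in fp16_t.cc above, rounding every intermediate result to the nearest fp16):

#include <cstdio>
#include "common/fp16_t.h"

int main() {
  ge::fp16_t a, b;
  a = 0.1f;              // stored as the nearest fp16, 0x2E66 ~ 0.0999756
  b = 0.2f;              // stored as 0x3266 ~ 0.1999512
  ge::fp16_t c = a + b;  // the sum is rounded once more: ~0.2998, not 0.3
  std::printf("%f\n", c.ToFloat());
  return 0;
}

+ /// @ingroup fp16_t math compare operator + /// @param [in] fp fp16_t object to be compared + /// @brief Override basic comparison operator to perform fp16_t if-equal comparison + /// @return Return boolean result of if-equal comparison of this and fp.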
+ bool operator==(const TagFp16 &fp) const; + /// @ingroup fp16_t math compare operator + /// @param [in] fp fp16_t object to be compared + /// @brief Override basic comparison operator to perform fp16_t not-equal comparison + /// @return Return boolean result of not-equal comparison of this and fp. + bool operator!=(const TagFp16 &fp) const; + /// @ingroup fp16_t math compare operator + /// @param [in] fp fp16_t object to be compared + /// @brief Override basic comparison operator to perform fp16_t greater-than comparison + /// @return Return boolean result of greater-than comparison of this and fp. + bool operator>(const TagFp16 &fp) const; + /// @ingroup fp16_t math compare operator + /// @param [in] fp fp16_t object to be compared + /// @brief Override basic comparison operator to perform fp16_t greater-equal comparison + /// @return Return boolean result of greater-equal comparison of this and fp. + bool operator>=(const TagFp16 &fp) const; + /// @ingroup fp16_t math compare operator + /// @param [in] fp fp16_t object to be compared + /// @brief Override basic comparison operator to perform fp16_t less-than comparison + /// @return Return boolean result of less-than comparison of this and fp. + bool operator<(const TagFp16 &fp) const; + /// @ingroup fp16_t math compare operator + /// @param [in] fp fp16_t object to be compared + /// @brief Override basic comparison operator to perform fp16_t less-equal comparison + /// @return Return boolean result of less-equal comparison of this and fp. + bool operator<=(const TagFp16 &fp) const; + + /// @ingroup fp16_t math evaluation operator + /// @param [in] fp fp16_t object to be copied + /// @brief Override basic evaluation operator to copy fp16_t to a new fp16_t + /// @return Return fp16_t result from fp TagFp16 &operator=(const TagFp16 &fp); - /** - *@ingroup fp16_t math evaluation operator - *@param [in] fVal float object to be converted to fp16_t - *@brief Override basic evaluation operator to convert float to fp16_t - *@return Return fp16_t result from fVal - */ - TagFp16 &operator=(const float &fVal); - /** - *@ingroup fp16_t math evaluation operator - *@param [in] iVal int32_t object to be converted to fp16_t - *@brief Override basic evaluation operator to convert int32_t to fp16_t - *@return Return fp16_t result from iVal - */ - TagFp16 &operator=(const int32_t &iVal); - /** - *@ingroup fp16_t math conversion - *@brief Override convert operator to convert fp16_t to float/fp32 - *@return Return float/fp32 value of fp16_t - */ + /// @ingroup fp16_t math evaluation operator + /// @param [in] f_val float object to be converted to fp16_t + /// @brief Override basic evaluation operator to convert float to fp16_t + /// @return Return fp16_t result from f_val + TagFp16 &operator=(const float &f_val); + /// @ingroup fp16_t math evaluation operator + /// @param [in] d_val double object to be converted to fp16_t + /// @brief Override basic evaluation operator to convert double to fp16_t + /// @return Return fp16_t result from d_val + TagFp16 &operator=(const double &d_val); + /// @ingroup fp16_t math evaluation operator + /// @param [in] i_val int8_t object to be converted to fp16_t + /// @brief Override basic evaluation operator to convert int8_t to fp16_t + /// @return Return fp16_t result from i_val + TagFp16 &operator=(const int8_t &i_val); + /// @ingroup fp16_t math evaluation operator + /// @param [in] ui_val uint8_t object to be converted to fp16_t + /// @brief Override basic evaluation operator to convert uint8_t to
fp16_t + /// @return Return fp16_t result from ui_val + TagFp16 &operator=(const uint8_t &ui_val); + /// @ingroup fp16_t math evaluation operator + /// @param [in] i_val int16_t object to be converted to fp16_t + /// @brief Override basic evaluation operator to convert int16_t to fp16_t + /// @return Return fp16_t result from i_val + TagFp16 &operator=(const int16_t &i_val); + /// @ingroup fp16_t math evaluation operator + /// @param [in] ui_val uint16_t object to be converted to fp16_t + /// @brief Override basic evaluation operator to convert uint16_t to fp16_t + /// @return Return fp16_t result from ui_val + TagFp16 &operator=(const uint16_t &ui_val); + /// @ingroup fp16_t math evaluation operator + /// @param [in] i_val int32_t object to be converted to fp16_t + /// @brief Override basic evaluation operator to convert int32_t to fp16_t + /// @return Return fp16_t result from i_val + TagFp16 &operator=(const int32_t &i_val); + /// @ingroup fp16_t math evaluation operator + /// @param [in] ui_val uint32_t object to be converted to fp16_t + /// @brief Override basic evaluation operator to convert uint32_t to fp16_t + /// @return Return fp16_t result from ui_val + TagFp16 &operator=(const uint32_t &ui_val); + /// @ingroup fp16_t math conversion + /// @brief Override convert operator to convert fp16_t to float/fp32 + /// @return Return float/fp32 value of fp16_t operator float() const; - - /** - *@ingroup fp16_t math conversion - *@brief Override convert operator to convert fp16_t to int32_t - *@return Return int32_t value of fp16_t - */ + /// @ingroup fp16_t math conversion + /// @brief Override convert operator to convert fp16_t to double/fp64 + /// @return Return double/fp64 value of fp16_t + operator double() const; + /// @ingroup fp16_t math conversion + /// @brief Override convert operator to convert fp16_t to int8_t + /// @return Return int8_t value of fp16_t + operator int8_t() const; + /// @ingroup fp16_t math conversion + /// @brief Override convert operator to convert fp16_t to uint8_t + /// @return Return uint8_t value of fp16_t + operator uint8_t() const; + /// @ingroup fp16_t conversion + /// @brief Override convert operator to convert fp16_t to int16_t + /// @return Return int16_t value of fp16_t + operator int16_t() const; + /// @ingroup fp16_t math conversion + /// @brief Override convert operator to convert fp16_t to uint16_t + /// @return Return uint16_t value of fp16_t + operator uint16_t() const; + /// @ingroup fp16_t math conversion + /// @brief Override convert operator to convert fp16_t to int32_t + /// @return Return int32_t value of fp16_t operator int32_t() const; - - /** - *@ingroup fp16_t math conversion - *@brief Convert fp16_t to float/fp32 - *@return Return float/fp32 value of fp16_t - */ - float toFloat() const; - - /** - *@ingroup fp16_t math conversion - *@brief Convert fp16_t to int32_t - *@return Return int32_t value of fp16_t - */ - int32_t toInt32() const; + /// @ingroup fp16_t math conversion + /// @brief Override convert operator to convert fp16_t to uint32_t + /// @return Return uint32_t value of fp16_t + operator uint32_t() const; + /// @ingroup fp16_t math conversion + /// @brief Override convert operator to convert fp16_t to int64_t + /// @return Return int64_t value of fp16_t + operator int64_t() const; + /// @ingroup fp16_t math conversion + /// @brief Override convert operator to convert fp16_t to uint64_t + /// @return Return uint64_t value of fp16_t + operator uint64_t() const; + /// @ingroup fp16_t judgment method + /// @param [in] fp 
fp16_t object to be judged + /// @brief whether a fp16_t is infinite + /// @return Returns 1:+INF -1:-INF 0:not INF + int IsInf(); + /// @ingroup fp16_t math conversion + /// @brief Convert fp16_t to float/fp32 + /// @return Return float/fp32 value of fp16_t + float ToFloat() const; + /// @ingroup fp16_t math conversion + /// @brief Convert fp16_t to double/fp64 + /// @return Return double/fp64 value of fp16_t + double ToDouble() const; + /// @ingroup fp16_t math conversion + /// @brief Convert fp16_t to int8_t + /// @return Return int8_t value of fp16_t + int8_t ToInt8() const; + /// @ingroup fp16_t math conversion + /// @brief Convert fp16_t to uint8_t + /// @return Return uint8_t value of fp16_t + uint8_t ToUInt8() const; + /// @ingroup fp16_t conversion + /// @brief Convert fp16_t to int16_t + /// @return Return int16_t value of fp16_t + int16_t ToInt16() const; + /// @ingroup fp16_t math conversion + /// @brief Convert fp16_t to uint16_t + /// @return Return uint16_t value of fp16_t + uint16_t ToUInt16() const; + /// @ingroup fp16_t math conversion + /// @brief Convert fp16_t to int32_t + /// @return Return int32_t value of fp16_t + int32_t ToInt32() const; + /// @ingroup fp16_t math conversion + /// @brief Convert fp16_t to uint32_t + /// @return Return uint32_t value of fp16_t + uint32_t ToUInt32() const; }; -inline bool operator>(const TagFp16 &lhs, const TagFp16 &rhs) { return lhs.toFloat() > rhs.toFloat(); } -inline bool operator<(const TagFp16 &lhs, const TagFp16 &rhs) { return lhs.toFloat() < rhs.toFloat(); } -inline bool operator==(const TagFp16 &lhs, const TagFp16 &rhs) { return lhs.toFloat() == rhs.toFloat(); } -inline bool operator!=(const TagFp16 &lhs, const TagFp16 &rhs) { return lhs.toFloat() != rhs.toFloat(); } - -/** - *@ingroup fp16_t public method - *@param [in] val signature is negative - *@param [in|out] s sign of fp16_t object - *@param [in|out] e exponent of fp16_t object - *@param [in|out] m mantissa of fp16_t object - *@brief Extract the sign, exponent and mantissa of a fp16_t object - */ -void ExtractFP16(const uint16_t &val, uint16_t *s, int16_t *e, uint16_t *m); -/** - *@ingroup fp16_t public method - *@param [in] bit0 whether the last preserved bit is 1 before round - *@param [in] bit1 whether the abbreviation's highest bit is 1 - *@param [in] bit_left whether the abbreviation's bits which not contain highest bit grater than 0 - *@param [in] man mantissa of a fp16_t or float number, support types: uint16_t/uint32_t/uint64_t - *@param [in] shift abbreviation bits - *@brief Round fp16_t or float mantissa to nearest value - *@return Returns true if round 1,otherwise false; - */ +/// @ingroup fp16_t public method +/// @param [in] val fp16_t value to be extracted +/// @param [in|out] s sign of fp16_t object +/// @param [in|out] e exponent of fp16_t object +/// @param [in|out] m mantissa of fp16_t object +/// @brief Extract the sign, exponent and mantissa of a fp16_t object +void ExtractFp16(const uint16_t &val, uint16_t &s, int16_t &e, uint16_t &m); +/// @ingroup fp16_t public method +/// @param [in] negative sign is negative +/// @param [in|out] man mantissa to be reversed +/// @brief Calculate a mantissa's complement (add one to its radix-minus-one complement) +/// @return Return complement of man +template <typename T> +void ReverseMan(bool negative, T &man) { + if (negative) { + man = (~(man)) + 1; + } +}
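The helpers below operate on the raw S|EEEEE|MMMMMMMMMM encoding. A small hand-checked sketch of that layout, for orientation (the values are illustrative, not taken from the patch):

#include <cstdint>
#include <cstdio>

// 1.0f  -> 0 01111 0000000000 = 0x3C00 (unbiased exponent 0, mantissa 1.0)
// -2.5f -> 1 10000 0100000000 = 0xC100 (2.5 = 1.25 * 2^1, biased exponent 16)
int main() {
  uint16_t val = 0xC100u;
  uint16_t s = (val >> 15) & 1u;   // what FP16_EXTRAC_SIGN computes
  int16_t e = (val >> 10) & 0x1F;  // what FP16_EXTRAC_EXP computes
  uint16_t m = val & 0x3FFu;       // raw 10-bit mantissa, hidden bit not yet restored
  std::printf("s=%u e=%d m=0x%03X\n", (unsigned)s, (int)e, (unsigned)m);  // s=1 e=16 m=0x100
  return 0;
}

+/// @ingroup fp16_t public method +/// @param [in] e_a exponent of one fp16_t/float number +/// @param [in] m_a mantissa of one fp16_t/float number +/// @param [in] e_b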
exponent of another fp16_t/float number +/// @param [in] m_b mantissa of another fp16_t/float number +/// @brief choose the mantissa to be shifted right, i.e. the one whose exponent is smaller +/// @return Return the mantissa whose exponent is smaller template <typename T> -T ManRoundToNearest(bool bit0, bool bit1, bool bit_left, T man, uint16_t shift = 0) { - man = (man >> shift) + ((bit1 && (bit_left || bit0)) ? 1 : 0); +T MinMan(const int16_t &e_a, T &m_a, const int16_t &e_b, T &m_b) { + return (e_a > e_b) ? m_b : m_a; +} +/// @ingroup fp16_t public method +/// @param [in] man mantissa to be operated on +/// @param [in] shift right shift bits +/// @brief right shift a mantissa +/// @return Return right-shifted mantissa +template <typename T> +T RightShift(T man, int16_t shift) { + int bits = sizeof(T) * 8; // one byte has 8 bits + T mask = (((T)1u) << ((unsigned int)(bits - 1))); + for (int i = 0; i < shift; i++) { + man = ((man & mask) | (man >> 1)); + } return man; } - -/** - *@ingroup fp16_t public method - *@param [in] man mantissa of a float number, support types: uint16_t/uint32_t/uint64_t - *@brief Get bit length of a uint32_t number - *@return Return bit length of man - */ +/// @ingroup fp16_t public method +/// @param [in] e_a exponent of one temp fp16_t number +/// @param [in] m_a mantissa of one temp fp16_t number +/// @param [in] e_b exponent of another temp fp16_t number +/// @param [in] m_b mantissa of another temp fp16_t number +/// @brief Get mantissa sum of two temp fp16_t numbers, T support types: uint16_t/uint32_t/uint64_t +/// @return Return mantissa sum +template <typename T> +T GetManSum(int16_t e_a, const T &m_a, int16_t e_b, const T &m_b) { + T sum = 0; + if (e_a != e_b) { + T m_tmp = 0; + int16_t e_tmp = std::abs(e_a - e_b); + if (e_a > e_b) { + m_tmp = m_b; + m_tmp = RightShift(m_tmp, e_tmp); + sum = m_a + m_tmp; + } else { + m_tmp = m_a; + m_tmp = RightShift(m_tmp, e_tmp); + sum = m_tmp + m_b; + } + } else { + sum = m_a + m_b; + } + return sum; +}
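The three flags passed to the rounding helper below encode round-to-nearest-even: bit1 is the first dropped ("round") bit, bitLeft is the OR of the remaining dropped ("sticky") bits, and bit0 is the lowest surviving bit, used to break ties toward even. Two hand-checked cases (illustrative):

// Dropping 2 bits from 0b10110 (22): kept = 0b101, round = 1, sticky = 0, kept LSB = 1
//   -> 22/4 = 5.5 is a tie and 5 is odd, so round up:
//   ManRoundToNearest<uint16_t>(true, true, false, 0b10110u, 2) == 0b110 (6)
// Dropping 2 bits from 0b10010 (18): kept = 0b100, round = 1, sticky = 0, kept LSB = 0
//   -> 18/4 = 4.5 is a tie and 4 is even, so keep:
//   ManRoundToNearest<uint16_t>(false, true, false, 0b10010u, 2) == 0b100 (4)

+/// @ingroup fp16_t public method +/// @param [in] bit0 whether the last preserved bit is 1 before round +/// @param [in] bit1 whether the abbreviation's highest bit is 1 +/// @param [in] bitLeft whether the truncated bits below the highest one are greater than 0 +/// @param [in] man mantissa of a fp16_t or float number, support types: uint16_t/uint32_t/uint64_t +/// @param [in] shift abbreviation bits +/// @brief Round fp16_t or float mantissa to nearest value +/// @return Return the mantissa rounded to the nearest value +template <typename T> +T ManRoundToNearest(bool bit0, bool bit1, bool bitLeft, T man, uint16_t shift = 0) { + man = (man >> shift) + ((bit1 && (bitLeft || bit0)) ?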
1 : 0); + return man; +} +/// @ingroup fp16_t public method +/// @param [in] man mantissa of a float number, support types: uint16_t/uint32_t/uint64_t +/// @brief Get bit length of a uint32_t number +/// @return Return bit length of man template int16_t GetManBitLength(T man) { int16_t len = 0; @@ -323,6 +600,5 @@ int16_t GetManBitLength(T man) { } return len; } -}; // namespace ge - +}; // namespace ge #endif // GE_COMMON_FP16_T_H_ diff --git a/src/ge/common/ge/plugin_manager.cc b/src/ge/common/ge/plugin_manager.cc index b41afc5e..29cb8a83 100644 --- a/src/ge/common/ge/plugin_manager.cc +++ b/src/ge/common/ge/plugin_manager.cc @@ -27,6 +27,7 @@ #include #include "framework/common/debug/log.h" +#include "framework/common/util.h" namespace ge { static const int kMaxNumOfSo = 64; diff --git a/src/ge/common/ge_format_util.cc b/src/ge/common/ge_format_util.cc index 8b917db0..d0240224 100644 --- a/src/ge/common/ge_format_util.cc +++ b/src/ge/common/ge_format_util.cc @@ -15,7 +15,6 @@ */ #include "framework/common/ge_format_util.h" - #include "formats/formats.h" namespace ge { diff --git a/src/ge/common/helper/model_helper.cc b/src/ge/common/helper/model_helper.cc index 4026fab1..3f1c3f58 100644 --- a/src/ge/common/helper/model_helper.cc +++ b/src/ge/common/helper/model_helper.cc @@ -17,9 +17,9 @@ #include "framework/common/helper/model_helper.h" #include "common/ge/ge_util.h" -#include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" #include "framework/common/util.h" +#include "framework/common/debug/ge_log.h" #include "framework/omg/version.h" #include "graph/debug/ge_attr_define.h" #include "graph/load/new_model_manager/davinci_model_parser.h" @@ -27,10 +27,15 @@ #include "graph/utils/graph_utils.h" using domi::ModelTaskDef; +using ge::ModelBufferData; using ge::TBEKernelPtr; using ge::TBEKernelStore; using std::string; +namespace { +const int64_t kOriginalOmPartitionNum = 1; +} + namespace ge { FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelHelper::~ModelHelper() { (void)ReleaseLocalModelData(); } @@ -57,7 +62,8 @@ Status ModelHelper::SaveModelPartition(std::shared_ptr &om_fil FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmModel(const GeModelPtr &ge_model, const SaveParam &save_param, - const std::string &output_file) { + const std::string &output_file, + ModelBufferData &model) { if (output_file.empty()) { GELOGE(FAILED, "GraphBuilder SaveModel received invalid file name prefix"); return FAILED; @@ -85,13 +91,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod return PARAM_INVALID; } } - auto ge_model_weight = ge_model->GetWeight(); - GELOGI("WEIGHTS_DATA size is %zu", ge_model_weight.GetSize()); + GELOGI("WEIGHTS_DATA size is %zu , %p", ge_model_weight.GetSize(), ge_model_weight.GetData()); if (SaveModelPartition(om_file_save_helper, ModelPartitionType::WEIGHTS_DATA, ge_model_weight.GetData(), ge_model_weight.GetSize()) != SUCCESS) { - GELOGE(PARAM_INVALID, "Add weight partition failed"); - return PARAM_INVALID; + GELOGW("Add weight partition failed"); // weight is not necessary } TBEKernelStore tbe_kernel_store = ge_model->GetTBEKernelStore(); @@ -159,7 +163,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod string model_name = reinterpret_cast(model_header.name); GELOGI("Model name save:%s", model_name.c_str()); - Status ret = om_file_save_helper->SaveModel(save_param, output_file.c_str()); + Status ret = om_file_save_helper->SaveModel(save_param, 
output_file.c_str(), model, is_offline_); if (ret != SUCCESS) { GELOGE(FAILED, "OmFileSaveHelper SaveModel return fail."); return FAILED; @@ -223,12 +227,15 @@ ModelHelper::SaveOriginalGraphToOmModel(const ge::Graph &graph, const std::strin GELOGE(FAILED, "ModelHelper SaveModel failed for platform_version"); return FAILED; } - err = memcpy_s(model_header.name, MODEL_NAME_LENGTH, model_ptr->GetName().c_str(), model_ptr->GetName().size() + 1); + size_t name_size = model_ptr->GetName().size(); + name_size = name_size > (MODEL_NAME_LENGTH - 1) ? (MODEL_NAME_LENGTH - 1) : name_size; + err = memcpy_s(model_header.name, MODEL_NAME_LENGTH, model_ptr->GetName().c_str(), name_size); if (err != EOK) { GELOGE(FAILED, "ModelHelper SaveModel memory copy failed"); return FAILED; } - Status ret = om_file_save_helper->SaveModelToFile(output_file.c_str()); + ModelBufferData model; + Status ret = om_file_save_helper->SaveModelToFile(output_file.c_str(), model, is_offline_); return (ret == SUCCESS ? SUCCESS : FAILED); } @@ -242,6 +249,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadModel(c GELOGE(FAILED, "Model helper has already loaded!"); return FAILED; } + if (ReleaseLocalModelData() != SUCCESS) { GELOGE(FAILED, "ReleaseLocalModelData failed."); return FAILED; @@ -260,7 +268,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadModel(c model_addr_tmp_ = nullptr; return FAILED; } - + auto partition_table = reinterpret_cast(model_addr_tmp_); + if (partition_table->num == kOriginalOmPartitionNum) { + GELOGE(FAILED, "om model is error,please use executable om model"); + return FAILED; + } // Encrypt model need to del temp model/no encrypt model don't need to del model model_addr_tmp_ = nullptr; @@ -299,7 +311,7 @@ Status ModelHelper::LoadModelData(OmFileLoadHelper &om_load_helper) { ModelPartition partition_model_def; // no need to check value, DATA->NetOutput om_load_helper.GetModelPartition(ModelPartitionType::MODEL_DEF, partition_model_def); - GELOGI("Model_def partition size:%u", partition_model_def.size); + GELOGI("Model_def partition addr:%p,size:%u", partition_model_def.data, partition_model_def.size); ge::Model model; if (ge::Model::Load(partition_model_def.data, partition_model_def.size, model) != SUCCESS) { @@ -346,7 +358,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadTask(Om GELOGE(INTERNAL_ERROR, "ReadProtoFromArray failed."); return INTERNAL_ERROR; } - GELOGI("TASK_INFO op_size:%d, stream_num:%u", task->op().size(), task->stream_num()); + GELOGI("TASK_INFO op_size:%zu, stream_num:%u", task->op().size(), task->stream_num()); } model_->SetModelTaskDef(task); return SUCCESS; @@ -428,6 +440,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::TransModelT TBEKernelPtr tbe_kernel = node_op_desc->TryGetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); GE_IF_BOOL_EXEC(tbe_kernel == nullptr, continue); kernel_store.AddTBEKernel(tbe_kernel); + GELOGI("Add tbe kernel bin %s", tbe_kernel->GetName().c_str()); } } if (!kernel_store.Build()) { @@ -470,6 +483,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::TransGeMode GELOGE(MEMALLOC_FAILED, "alloc model attr task buffer failed!"); return MEMALLOC_FAILED; } + // no need to check value (void)model_task->SerializePartialToArray(buffer.GetData(), size); ret = ge::AttrUtils::SetZeroCopyBytes(model, MODEL_ATTR_TASKS, std::move(buffer)); if (!ret) { diff --git a/src/ge/common/helper/om_file_helper.cc 
b/src/ge/common/helper/om_file_helper.cc index dfe5c1d6..58477b4e 100644 --- a/src/ge/common/helper/om_file_helper.cc +++ b/src/ge/common/helper/om_file_helper.cc @@ -18,7 +18,6 @@ #include #include - #include "common/math/math_util.h" #include "common/auth/file_saver.h" #include "framework/common/debug/log.h" @@ -26,6 +25,9 @@ #include "framework/common/ge_inner_error_codes.h" #include "framework/common/util.h" +using ge::ModelBufferData; +using std::string; + namespace ge { // For Load FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::Init(const ge::ModelData &model) { @@ -107,13 +109,16 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint } // Init partition table auto partition_table = reinterpret_cast<ModelPartitionTable *>(model_data); - if ((partition_table->num != PARTITION_SIZE) && (partition_table->num != PARTITION_SIZE - 1)) { + // A Davinci model partition table includes graph-info, weight-info, task-info and tbe-kernel; + // an original model partition table includes only graph-info + if ((partition_table->num != PARTITION_SIZE) && (partition_table->num != (PARTITION_SIZE - 1)) && + (partition_table->num != 1)) { GELOGE(PARAM_INVALID, "Invalid partition_table->num:%u", partition_table->num); return PARAM_INVALID; } - size_t mem_offset = SIZE_OF_MODEL_PARTITION_TABLE(*partition_table); - GELOGI("sizeof(ModelFileHeader)=%zu, sizeof(ModelPartitionTable)=%zu", sizeof(ModelFileHeader), mem_offset); + GELOGI("ModelPartitionTable num :%u, ModelFileHeader length :%zu, ModelPartitionTable length :%zu", + partition_table->num, sizeof(ModelFileHeader), mem_offset); if (model_data_size <= mem_offset) { GELOGE(PARAM_INVALID, "invalid model data, partition_table->num:%u, model data size %u", partition_table->num, model_data_size); @@ -138,7 +143,7 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::vector<ModelPartition> - &OmFileSaveHelper::GetModelPartitions() const { + &OmFileSaveHelper::GetModelPartitions() const { return context_.partition_datas_; } @@ -162,7 +167,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelPartitionTable *OmFileSave } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileSaveHelper::AddPartition(ModelPartition &partition) { - if (CheckUint32AddOverflow(context_.model_data_len_, partition.size) != SUCCESS) { + if (ge::CheckUint32AddOverflow(context_.model_data_len_, partition.size) != SUCCESS) { GELOGE(FAILED, "UINT32 %u and %u addition can result in overflow!", context_.model_data_len_, partition.size); return FAILED; } @@ -171,20 +176,21 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileSaveHelper::AddPar return SUCCESS; } -Status OmFileSaveHelper::SaveModel(const SaveParam &save_param, const char *output_file) { +Status OmFileSaveHelper::SaveModel(const SaveParam &save_param, const char *output_file, ModelBufferData &model, + bool is_offline) { (void)save_param.cert_file; (void)save_param.ek_file; (void)save_param.encode_mode; (void)save_param.hw_key_file; (void)save_param.pri_key_file; - Status ret = SaveModelToFile(output_file); + Status ret = SaveModelToFile(output_file, model, is_offline); if (ret == SUCCESS) { GELOGI("Generate model with encrypt."); } return ret; }
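For orientation, the partition bookkeeping validated above has roughly this shape; the layout is a sketch inferred from how this file uses it, not the verbatim definitions:

// Hypothetical sketch; field names inferred from usage in om_file_helper.cc.
struct ModelPartitionMemInfo {
  ModelPartitionType type;  // e.g. MODEL_DEF, WEIGHTS_DATA, task info, tbe kernels
  uint32_t mem_offset;      // offset of the partition payload behind the table
  uint32_t mem_size;        // payload size in bytes
};
struct ModelPartitionTable {
  uint32_t num;                        // 1 (original om), PARTITION_SIZE or PARTITION_SIZE - 1 (executable om)
  ModelPartitionMemInfo partition[0];  // trailing array, hence SIZE_OF_MODEL_PARTITION_TABLE
};

-Status OmFileSaveHelper::SaveModelToFile(const char *output_file) { +Status OmFileSaveHelper::SaveModelToFile(const char *output_file, ModelBufferData &model, bool is_offline) { #if !defined(NONSUPPORT_SAVE_TO_FILE) uint32_t model_data_len = context_.model_data_len_; if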
(model_data_len == 0) { @@ -205,7 +211,12 @@ Status OmFileSaveHelper::SaveModelToFile(const char *output_file) { sizeof(ModelFileHeader), size_of_table, model_data_len, model_header_.length + sizeof(ModelFileHeader)); std::vector partition_datas = context_.partition_datas_; - Status ret = FileSaver::SaveToFile(output_file, model_header_, *partition_table, partition_datas); + Status ret; + if (is_offline) { + ret = FileSaver::SaveToFile(output_file, model_header_, *partition_table, partition_datas); + } else { + ret = FileSaver::SaveToBuffWithFileHeader(model_header_, *partition_table, partition_datas, model); + } if (ret == SUCCESS) { GELOGI("Save model success without encrypt."); } diff --git a/src/ge/common/math/fp16_math.cc b/src/ge/common/math/fp16_math.cc new file mode 100644 index 00000000..56183ced --- /dev/null +++ b/src/ge/common/math/fp16_math.cc @@ -0,0 +1,171 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fp16_math.h" +#include "external/register/register_types.h" + +namespace ge { +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t sqrt(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number square root + double dSqrt = std::sqrt(dVal); + // calculate result + ret = dSqrt; + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t rsqrt(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number square root and reciprocal + double drSqrt = 1.0 / std::sqrt(dVal); + // calculate result + ret = drSqrt; + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t rcp(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number reciprocal + double dRcp = 1.0 / dVal; + // calculate result + ret = dRcp; + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t exp(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number exponential + double dExp = std::exp(dVal); + // calculate result + ret = dExp; + + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t pow2(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number binary exponential + double dExp2 = std::pow(kDim2, dVal); + // calculate result + ret = dExp2; + + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t pow10(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number decimal exponential + double dExp10 = std::pow(kDim10, dVal); + // calculate result + ret = dExp10; + + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t ln(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = 
fp; + // Calculate double number natural logarithm + double dLn = std::log(dVal); + // calculate result + ret = dLn; + + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t log2(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number binary logarithm + double dLog2 = std::log2(dVal); + // calculate result + ret = dLog2; + + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t log10(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number decimal logarithm + double dLog10 = std::log10(dVal); + // calculate result + ret = dLog10; + + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t cos(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number cosine + double dCos = std::cos(dVal); + // calculate result + ret = dCos; + + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t sin(fp16_t fp) { + fp16_t ret; + // Convert half precision float number to double + double dVal = fp; + // Calculate double number sine + double dSin = std::sin(dVal); + // calculate result + ret = dSin; + + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t abs(fp16_t fp) { + fp16_t ret; + ret.val = (fp.val & kFp16AbsMax); + return ret; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t max(fp16_t fp1, fp16_t fp2) { + if (fp1 >= fp2) { + return fp1; + } else { + return fp2; + } +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY fp16_t min(fp16_t fp1, fp16_t fp2) { + if (fp1 <= fp2) { + return fp1; + } else { + return fp2; + } +} +} // namespace ge diff --git a/src/ge/common/math/fp16_math.h b/src/ge/common/math/fp16_math.h new file mode 100644 index 00000000..5bc9ac6d --- /dev/null +++ b/src/ge/common/math/fp16_math.h @@ -0,0 +1,96 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef GE_COMMON_MATH_FP16_MATH_H_ +#define GE_COMMON_MATH_FP16_MATH_H_ + +#include "common/fp16_t.h" + +namespace ge { +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t square root function of input fp +/// @return Returns fp16_t square root of fp +fp16_t sqrt(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t reciprocal square root function of input fp +/// @return Returns fp16_t reciprocal square root of fp +fp16_t rsqrt(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t reciprocal function of input fp +/// @return Returns fp16_t reciprocal of fp +fp16_t rcp(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t natural exponential function of input fp +/// @return Returns fp16_t natural exponential function of fp +fp16_t exp(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t binary exponential function of input fp +/// @return Returns fp16_t binary exponential function of fp +fp16_t pow2(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t decimal exponential function of input fp +/// @return Returns fp16_t decimal exponential function of fp +fp16_t pow10(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t natural logarithm of input fp +/// @return Returns fp16_t natural logarithm of fp +fp16_t ln(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t binary logarithm of input fp +/// @return Returns fp16_t binary logarithm of fp +fp16_t log2(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t decimal logarithm of input fp +/// @return Returns fp16_t decimal logarithm of fp +fp16_t log10(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t cosine of input fp +/// @return Returns fp16_t cosine of fp +fp16_t cos(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates fp16_t sine of input fp +/// @return Returns fp16_t sine of fp +fp16_t sin(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp fp16_t object to be calculated +/// @brief Calculates the absolute value (the sign bit is 0) of the given value +/// @return Returns fp16_t absolute value (the sign bit is 0) of fp +fp16_t abs(fp16_t fp); +/// @ingroup fp16_t mathematics method +/// @param [in] fp1 fp16_t object to be compared +/// @param [in] fp2 fp16_t object to be compared +/// @brief Calculates the maximum fp16_t of fp1 and fp2 +/// @return Returns maximum fp16_t of fp1 and fp2 +fp16_t max(fp16_t fp1, fp16_t fp2); +/// @ingroup fp16_t mathematics method +/// @param [in] fp1 fp16_t object to be compared +/// @param [in] fp2 fp16_t object to be compared +/// @brief Calculates the minimum fp16_t of fp1 and fp2 +/// @return Returns minimum fp16_t of fp1 and fp2 +fp16_t min(fp16_t fp1, fp16_t fp2); +}; // namespace ge +#endif // GE_COMMON_MATH_FP16_MATH_H_ \ No newline at end of file
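For reference, a minimal usage sketch of the helpers declared above (illustrative only, not part of the patch; Fp16MathDemo is a hypothetical name, the include path is inferred from the header guard, and default construction plus the implicit fp16_t/double conversions are assumed from how fp16_math.cc itself uses them):

    #include "common/math/fp16_math.h"

    void Fp16MathDemo() {
      ge::fp16_t x;
      x = 2.0;                                      // double -> fp16_t assignment, as in `ret = dSqrt;` above
      ge::fp16_t root = ge::sqrt(x);                // ~1.4142, computed through double and converted back
      ge::fp16_t inv_root = ge::rsqrt(x);           // 1 / sqrt(2)
      ge::fp16_t larger = ge::max(root, inv_root);  // relies on fp16_t's comparison operators
      double round_trip = larger;                   // fp16_t -> double, as in `double dVal = fp;`
      (void)round_trip;
    }
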
diff --git a/src/ge/common/math/math_util.h b/src/ge/common/math/math_util.h index 8a78317e..56148240 100644 --- a/src/ge/common/math/math_util.h +++ b/src/ge/common/math/math_util.h @@ -22,101 +22,283 @@ #include #include +#include "common/fp16_t.h" #include "framework/common/debug/log.h" #include "framework/common/fmk_error_codes.h" +using ge::fp16_t; namespace ge { -/// /// @ingroup math_util /// @brief check whether int32 addition can result in overflow /// @param [in] a addend /// @param [in] b addend /// @return Status -/// inline Status CheckIntAddOverflow(int a, int b) { if (((b > 0) && (a > (INT_MAX - b))) || ((b < 0) && (a < (INT_MIN - b)))) { return FAILED; } return SUCCESS; } - -/// /// @ingroup math_util -/// @brief check whether int64 addition can result in overflow +/// @brief check whether int8 addition can result in overflow /// @param [in] a addend /// @param [in] b addend /// @return Status -/// -inline Status CheckInt64AddOverflow(int64_t a, int64_t b) { - if (((b > 0) && (a > (INT64_MAX - b))) || ((b < 0) && (a < (INT64_MIN - b)))) { +inline Status CheckInt8AddOverflow(int8_t a, int8_t b) { + if (((b > 0) && (a > (INT8_MAX - b))) || ((b < 0) && (a < (INT8_MIN - b)))) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether int16 addition can result in overflow +/// @param [in] a addend +/// @param [in] b addend +/// @return Status +inline Status CheckInt16AddOverflow(int16_t a, int16_t b) { + if (((b > 0) && (a > (INT16_MAX - b))) || ((b < 0) && (a < (INT16_MIN - b)))) { return FAILED; } return SUCCESS; } - -/// /// @ingroup math_util /// @brief check whether int32 addition can result in overflow /// @param [in] a addend /// @param [in] b addend /// @return Status -/// inline Status CheckInt32AddOverflow(int32_t a, int32_t b) { if (((b > 0) && (a > (INT32_MAX - b))) || ((b < 0) && (a < (INT32_MIN - b)))) { return FAILED; } return SUCCESS; } - -/// +/// @ingroup math_util +/// @brief check whether int64 addition can result in overflow +/// @param [in] a addend +/// @param [in] b addend +/// @return Status +inline Status CheckInt64AddOverflow(int64_t a, int64_t b) { + if (((b > 0) && (a > (INT64_MAX - b))) || ((b < 0) && (a < (INT64_MIN - b)))) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether uint8 addition can result in overflow +/// @param [in] a addend +/// @param [in] b addend +/// @return Status +inline Status CheckUint8AddOverflow(uint8_t a, uint8_t b) { + if (a > (UINT8_MAX - b)) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether uint16 addition can result in overflow +/// @param [in] a addend +/// @param [in] b addend +/// @return Status +inline Status CheckUint16AddOverflow(uint16_t a, uint16_t b) { + if (a > (UINT16_MAX - b)) { + return FAILED; + } + return SUCCESS; +} /// @ingroup math_util /// @brief check whether uint32 addition can result in overflow /// @param [in] a addend /// @param [in] b addend /// @return Status -/// inline Status CheckUint32AddOverflow(uint32_t a, uint32_t b) { if (a > (UINT32_MAX - b)) { return FAILED; } return SUCCESS; } - -/// +/// @ingroup math_util +/// @brief check whether uint64 addition can result in overflow +/// @param [in] a addend +/// @param [in] b addend +/// @return Status +inline Status CheckUint64AddOverflow(uint64_t a, uint64_t b) { + if (a > (UINT64_MAX - b)) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether fp16_t addition can result in overflow 
+/// @param [in] a addend +/// @param [in] b addend +/// @return Status +inline Status CheckFp16AddOverflow(fp16_t a, fp16_t b) { + fp16_t result = static_cast(a) + static_cast(b); + if (FP16_IS_INVALID(result.val)) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether float addition can result in overflow +/// @param [in] a addend +/// @param [in] b addend +/// @return Status +inline Status CheckFloatAddOverflow(float a, float b) { + if (std::isfinite(static_cast(a) + static_cast(b)) == false) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether double addition can result in overflow +/// @param [in] a addend +/// @param [in] b addend +/// @return Status +inline Status CheckDoubleAddOverflow(double a, double b) { + if (std::isfinite(static_cast(a) + static_cast(b)) == false) { + return FAILED; + } + return SUCCESS; +} /// @ingroup math_util /// @brief check whether int subtraction can result in overflow /// @param [in] a subtrahend /// @param [in] b minuend /// @return Status -/// inline Status CheckIntSubOverflow(int a, int b) { if (((b > 0) && (a < (INT_MIN + b))) || ((b < 0) && (a > (INT_MAX + b)))) { return FAILED; } return SUCCESS; } - -/// +/// @ingroup math_util +/// @brief check whether int8 subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckInt8SubOverflow(int8_t a, int8_t b) { + if (((b > 0) && (a < (INT8_MIN + b))) || ((b < 0) && (a > (INT8_MAX + b)))) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether int16 subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckInt16SubOverflow(int16_t a, int16_t b) { + if (((b > 0) && (a < (INT16_MIN + b))) || ((b < 0) && (a > (INT16_MAX + b)))) { + return FAILED; + } + return SUCCESS; +} /// @ingroup math_util /// @brief check whether int32 subtraction can result in overflow /// @param [in] a subtrahend /// @param [in] b minuend /// @return Status -/// inline Status CheckInt32SubOverflow(int32_t a, int32_t b) { if (((b > 0) && (a < (INT32_MIN + b))) || ((b < 0) && (a > (INT32_MAX + b)))) { return FAILED; } return SUCCESS; } - -/// +/// @ingroup math_util +/// @brief check whether int64 subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckInt64SubOverflow(int64_t a, int64_t b) { + if (((b > 0) && (a < (INT64_MIN + b))) || ((b < 0) && (a > (INT64_MAX + b)))) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether uint8 subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckUint8SubOverflow(uint8_t a, uint8_t b) { + if (a < b) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether uint16 subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckUint16SubOverflow(uint16_t a, uint16_t b) { + if (a < b) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether uint32 subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckUint32SubOverflow(uint32_t a, uint32_t b) { + if (a < b) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief 
check whether uint64 subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckUint64SubOverflow(uint64_t a, uint64_t b) { + if (a < b) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether fp16_t subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckFp16SubOverflow(fp16_t a, fp16_t b) { + fp16_t result = static_cast<float>(a) - static_cast<float>(b); + if (FP16_IS_INVALID(result.val)) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether float subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckFloatSubOverflow(float a, float b) { + if (std::isfinite(static_cast<float>(a) - static_cast<float>(b)) == false) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether double subtraction can result in overflow +/// @param [in] a subtrahend +/// @param [in] b minuend +/// @return Status +inline Status CheckDoubleSubOverflow(double a, double b) { + if (std::isfinite(static_cast<double>(a) - static_cast<double>(b)) == false) { + return FAILED; + } + return SUCCESS; +} /// @ingroup math_util /// @brief check whether int multiplication can result in overflow /// @param [in] a multiplicator /// @param [in] b multiplicator /// @return Status -/// inline Status CheckIntMulOverflow(int a, int b) { if (a > 0) { if (b > 0) { @@ -141,14 +323,69 @@ inline Status CheckIntMulOverflow(int a, int b) { } return SUCCESS; } - -/// +/// @ingroup math_util +/// @brief check whether int8 multiplication can result in overflow +/// @param [in] a multiplicator +/// @param [in] b multiplicator +/// @return Status +inline Status CheckInt8MulOverflow(int8_t a, int8_t b) { + if (a > 0) { + if (b > 0) { + if (a > (INT8_MAX / b)) { + return FAILED; + } + } else { + if (b < (INT8_MIN / a)) { + return FAILED; + } + } + } else { + if (b > 0) { + if (a < (INT8_MIN / b)) { + return FAILED; + } + } else { + if ((a != 0) && (b < (INT8_MAX / a))) { + return FAILED; + } + } + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether int16 multiplication can result in overflow +/// @param [in] a multiplicator +/// @param [in] b multiplicator +/// @return Status +inline Status CheckInt16MulOverflow(int16_t a, int16_t b) { + if (a > 0) { + if (b > 0) { + if (a > (INT16_MAX / b)) { + return FAILED; + } + } else { + if (b < (INT16_MIN / a)) { + return FAILED; + } + } + } else { + if (b > 0) { + if (a < (INT16_MIN / b)) { + return FAILED; + } + } else { + if ((a != 0) && (b < (INT16_MAX / a))) { + return FAILED; + } + } + } + return SUCCESS; +} /// @ingroup math_util /// @brief check whether int32 multiplication can result in overflow /// @param [in] a multiplicator /// @param [in] b multiplicator /// @return Status -/// inline Status CheckInt32MulOverflow(int32_t a, int32_t b) { if (a > 0) { if (b > 0) { @@ -206,13 +443,40 @@ inline Status CheckInt64Int32MulOverflow(int64_t a, int32_t b) { return SUCCESS; } -/// /// @ingroup math_util /// @brief check whether int64 multiplication can result in overflow /// @param [in] a multiplicator /// @param [in] b multiplicator /// @return Status -/// +inline Status Int64MulCheckOverflow(int64_t a, int64_t b) { + if (a > 0) { + if (b > 0) { + if (a > (INT64_MAX / b)) { + return FAILED; + } + } else { + if (b < (INT64_MIN / a)) { + return FAILED; + } + } + } else { + if (b > 0) { + if (a < (INT64_MIN /
b)) { + return FAILED; + } + } else { + if ((a != 0) && (b < (INT64_MAX / a))) { + return FAILED; + } + } + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether int64 and uint32 multiplication can result in overflow +/// @param [in] a multiplicator +/// @param [in] b multiplicator +/// @return Status inline Status CheckInt64Uint32MulOverflow(int64_t a, uint32_t b) { if (a == 0 || b == 0) { return SUCCESS; @@ -228,14 +492,43 @@ inline Status CheckInt64Uint32MulOverflow(int64_t a, uint32_t b) { } return SUCCESS; } +/// @ingroup math_util +/// @brief check whether uint8 multiplication can result in overflow +/// @param [in] a multiplicator +/// @param [in] b multiplicator +/// @return Status +inline Status CheckUint8MulOverflow(uint8_t a, uint8_t b) { + if (a == 0 || b == 0) { + return SUCCESS; + } - + if (a > (UINT8_MAX / b)) { + return FAILED; + } + + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether uint16 multiplication can result in overflow +/// @param [in] a multiplicator +/// @param [in] b multiplicator +/// @return Status +inline Status CheckUint16MulOverflow(uint16_t a, uint16_t b) { + if (a == 0 || b == 0) { + return SUCCESS; + } + + if (a > (UINT16_MAX / b)) { + return FAILED; + } + + return SUCCESS; +} /// @ingroup math_util /// @brief check whether uint32 multiplication can result in overflow /// @param [in] a multiplicator /// @param [in] b multiplicator /// @return Status -/// inline Status CheckUint32MulOverflow(uint32_t a, uint32_t b) { if (a == 0 || b == 0) { return SUCCESS; @@ -247,28 +540,73 @@ inline Status CheckUint32MulOverflow(uint32_t a, uint32_t b) { return SUCCESS; } +/// @ingroup math_util +/// @brief check whether uint64 multiplication can result in overflow +/// @param [in] a multiplicator +/// @param [in] b multiplicator +/// @return Status +inline Status CheckUint64MulOverflow(uint64_t a, uint64_t b) { + if (a == 0 || b == 0) { + return SUCCESS; + } - + if (a > (UINT64_MAX / b)) { + return FAILED; + } + + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether fp16_t multiplication can result in overflow +/// @param [in] a multiplicator +/// @param [in] b multiplicator +/// @return Status +inline Status CheckFp16MulOverflow(fp16_t a, fp16_t b) { + fp16_t result = static_cast<float>(a) * static_cast<float>(b); + if (FP16_IS_INVALID(result.val)) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether float multiplication can result in overflow +/// @param [in] a multiplicator +/// @param [in] b multiplicator +/// @return Status +inline Status CheckFloatMulOverflow(float a, float b) { + if (std::isfinite(static_cast<float>(a) * static_cast<float>(b)) == false) { + return FAILED; + } + return SUCCESS; +} +/// @ingroup math_util +/// @brief check whether double multiplication can result in overflow +/// @param [in] a multiplicator +/// @param [in] b multiplicator +/// @return Status +inline Status CheckDoubleMulOverflow(double a, double b) { + if (std::isfinite(static_cast<double>(a) * static_cast<double>(b)) == false) { + return FAILED; + } + return SUCCESS; +} /// @ingroup math_util /// @brief check whether int division can result in overflow /// @param [in] a dividend /// @param [in] b divisor /// @return Status -/// inline Status CheckIntDivOverflow(int a, int b) { if ((b == 0) || ((a == INT_MIN) && (b == -1))) { return FAILED; } return SUCCESS; } - -/// /// @ingroup math_util /// @brief check whether int32 division can result in overflow /// @param [in] a dividend /// @param [in] b divisor /// @return
Status -/// inline Status CheckInt32DivOverflow(int32_t a, int32_t b) { if ((b == 0) || ((a == INT32_MIN) && (b == -1))) { return FAILED; @@ -276,73 +614,278 @@ inline Status CheckInt32DivOverflow(int32_t a, int32_t b) { return SUCCESS; } -#define FMK_INT_ADDCHECK(a, b) \ - if (CheckIntAddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int %d and %d addition can result in overflow!", (a), (b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT_ADDCHECK(a, b) \ + if (ge::CheckIntAddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "Int %d and %d addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT8_ADDCHECK(a, b) \ + if (ge::CheckInt8AddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "Int8 %d and %d addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT16_ADDCHECK(a, b) \ + if (ge::CheckInt16AddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "Int16 %d and %d addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT32_ADDCHECK(a, b) \ + if (ge::CheckInt32AddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "Int32 %d and %d addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT64_ADDCHECK(a, b) \ + if (ge::CheckInt64AddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "Int64 %ld and %ld addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_UINT8_ADDCHECK(a, b) \ + if (ge::CheckUint8AddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT8 %u and %u addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT32_ADDCHECK(a, b) \ - if (CheckInt32AddOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "Int32 %d and %d addition can result in overflow!", (a), (b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT16_ADDCHECK(a, b) \ + if (ge::CheckUint16AddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT16 %u and %u addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } #define FMK_UINT32_ADDCHECK(a, b) \ - if (CheckUint32AddOverflow((a), (b)) != SUCCESS) { \ + if (ge::CheckUint32AddOverflow((a), (b)) != SUCCESS) { \ GELOGE(INTERNAL_ERROR, "UINT32 %u and %u addition can result in overflow!", static_cast(a), \ static_cast(b)); \ return INTERNAL_ERROR; \ } -#define FMK_INT_SUBCHECK(a, b) \ - if (CheckIntSubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT %d and %d subtraction can result in overflow!", (a), (b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT64_ADDCHECK(a, b) \ + if (ge::CheckUint64AddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT64 %lu and %lu addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_FP16_ADDCHECK(a, b) \ + if (ge::CheckFp16AddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "fp16 %f and %f addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_FLOAT_ADDCHECK(a, b) \ + if (ge::CheckFloatAddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "float %f and %f addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ 
+ } + +#define FMK_DOUBLE_ADDCHECK(a, b) \ + if (ge::CheckDoubleAddOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "double %lf and %lf addition can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT_SUBCHECK(a, b) \ + if (ge::CheckIntSubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT %d and %d subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT8_SUBCHECK(a, b) \ + if (ge::CheckInt8SubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT8 %d and %d subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT16_SUBCHECK(a, b) \ + if (ge::CheckInt16SubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT16 %d and %d subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT32_SUBCHECK(a, b) \ + if (ge::CheckInt32SubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT32 %d and %d subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT64_SUBCHECK(a, b) \ + if (ge::CheckInt64SubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT64 %ld and %ld subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_UINT8_SUBCHECK(a, b) \ + if (ge::CheckUint8SubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT8 %u and %u subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_UINT16_SUBCHECK(a, b) \ + if (ge::CheckUint16SubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT16 %u and %u subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_UINT32_SUBCHECK(a, b) \ + if (ge::CheckUint32SubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT32 %u and %u subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_UINT64_SUBCHECK(a, b) \ + if (ge::CheckUint64SubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT64 %lu and %lu subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_FP16_SUBCHECK(a, b) \ + if (ge::CheckFp16SubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "fp16 %f and %f subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_FLOAT_SUBCHECK(a, b) \ + if (ge::CheckFloatSubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "float %f and %f subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_DOUBLE_SUBCHECK(a, b) \ + if (ge::CheckDoubleSubOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "double %lf and %lf subtraction can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT_MULCHECK(a, b) \ + if (ge::CheckIntMulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT %d and %d multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT8_MULCHECK(a, b) \ + if (ge::CheckInt8MulOverflow((a), (b)) != SUCCESS) { \ + 
GELOGE(INTERNAL_ERROR, "INT8 %d and %d multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT16_MULCHECK(a, b) \ + if (ge::CheckInt16MulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT16 %d and %d multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_INT32_MULCHECK(a, b) \ + if (ge::CheckInt32MulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT32 %d and %d multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT32_SUBCHECK(a, b) \ - if (CheckInt32SubOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT32 %d and %d subtraction can result in overflow!", (a), (b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT64_MULCHECK(a, b) \ + if (ge::Int64MulCheckOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT64 %ld and %ld multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT_MULCHECK(a, b) \ - if (CheckIntMulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT %d and %d multiplication can result in overflow!", (a), (b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT8_MULCHECK(a, b) \ + if (ge::CheckUint8MulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT8 %u and %u multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT32_MULCHECK(a, b) \ - if (CheckInt32MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT32 %d and %d multiplication can result in overflow!", (a), (b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT16_MULCHECK(a, b) \ + if (ge::CheckUint16MulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT16 %u and %u multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } #define FMK_UINT32_MULCHECK(a, b) \ - if (CheckUint32MulOverflow((a), (b)) != SUCCESS) { \ + if (ge::CheckUint32MulOverflow((a), (b)) != SUCCESS) { \ GELOGE(INTERNAL_ERROR, "UINT32 %u and %u multiplication can result in overflow!", static_cast(a), \ static_cast(b)); \ return INTERNAL_ERROR; \ } -#define FMK_INT_DIVCHECK(a, b) \ - if (CheckIntDivOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT %d and %d division can result in overflow!", (a), (b)); \ - return INTERNAL_ERROR; \ +#define FMK_UINT64_MULCHECK(a, b) \ + if (ge::CheckUint64MulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "UINT64 %lu and %lu multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_FP16_MULCHECK(a, b) \ + if (ge::CheckFp16MulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "fp16 %f and %f multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_FLOAT_MULCHECK(a, b) \ + if (ge::CheckFloatMulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "float %f and %f multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } + +#define FMK_DOUBLE_MULCHECK(a, b) \ + if (ge::CheckDoubleMulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "double %lf and %lf multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT32_DIVCHECK(a, b) \ - if 
(CheckInt32DivOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT32 %d and %d division can result in overflow!", (a), (b)); \ - return INTERNAL_ERROR; \ +#define FMK_INT_DIVCHECK(a, b) \ + if (CheckIntDivOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT %d and %d division can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ } -#define FMK_INT64_UINT32_MULCHECK(a, b) \ - if (CheckInt64Uint32MulOverflow((a), (b)) != SUCCESS) { \ - GELOGE(INTERNAL_ERROR, "INT64 %ld and UINT32 %u multiplication can result in overflow!", (a), (b)); \ +#define FMK_INT32_DIVCHECK(a, b) \ + if (CheckInt32DivOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT32 %d and %d division can result in overflow!", static_cast(a), \ + static_cast(b)); \ return INTERNAL_ERROR; \ } + +#define FMK_INT64_UINT32_MULCHECK(a, b) \ + if (ge::CheckInt64Uint32MulOverflow((a), (b)) != SUCCESS) { \ + GELOGE(INTERNAL_ERROR, "INT64 %ld and UINT32 %u multiplication can result in overflow!", static_cast(a), \ + static_cast(b)); \ + return INTERNAL_ERROR; \ + } } // namespace ge #endif // GE_COMMON_MATH_MATH_UTIL_H_ diff --git a/src/ge/common/math_util.h b/src/ge/common/math_util.h index 150f4154..5e783e81 100644 --- a/src/ge/common/math_util.h +++ b/src/ge/common/math_util.h @@ -21,12 +21,10 @@ #include #include -#include "Eigen/Eigen" #include "framework/common/debug/log.h" #include "framework/common/types.h" #include "framework/common/util.h" #include "mmpa/mmpa_api.h" -#include "unsupported/Eigen/CXX11/Tensor" namespace ge { @@ -70,6 +68,6 @@ Status NnSet(const int32_t n, const Dtype alpha, Dtype *output) { return SUCCESS; } -} // namespace ge +} // end namespace ge #endif // GE_COMMON_MATH_UTIL_H_ diff --git a/src/ge/common/model_parser/base.cc b/src/ge/common/model_parser/base.cc index 79b885ed..a9a21ec5 100644 --- a/src/ge/common/model_parser/base.cc +++ b/src/ge/common/model_parser/base.cc @@ -47,7 +47,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::LoadFro // get length of file: (void)fs.seekg(0, std::ifstream::end); - int64_t len = fs.tellg(); + uint32_t len = static_cast(fs.tellg()); GE_CHECK_GE(len, 1); @@ -84,10 +84,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::ParseMo auto file_header = reinterpret_cast(model.model_data); // Determine whether the file length and magic number match GE_CHK_BOOL_RET_STATUS( - file_header->length == model.model_len - sizeof(ModelFileHeader) && file_header->magic == MODEL_FILE_MAGIC_NUM, - PARAM_INVALID, - "Invalid model. file_header->length + sizeof(ModelFileHeader) != model->model_len || MODEL_FILE_MAGIC_NUM != " - "file_header->magic"); + file_header->length == model.model_len - sizeof(ModelFileHeader) && file_header->magic == MODEL_FILE_MAGIC_NUM, + PARAM_INVALID, + "Invalid model. file_header->length + sizeof(ModelFileHeader) != model->model_len || MODEL_FILE_MAGIC_NUM != " + "file_header->magic"); Status res = SUCCESS; @@ -99,7 +99,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelParserBase::ParseMo model_data = data; model_len = file_header->length; - GELOGI("model_len is %u, model_file_head_len is %zu.", model_len, sizeof(ModelFileHeader)); + GELOGI("Model_len is %u, model_file_head_len is %zu.", model_len, sizeof(ModelFileHeader)); } else { GELOGE(PARAM_INVALID, "Invalid model. 
ModelEncryptType not supported."); res = PARAM_INVALID; diff --git a/src/ge/common/model_parser/base.h b/src/ge/common/model_parser/base.h index a78e28f1..22d58ace 100644 --- a/src/ge/common/model_parser/base.h +++ b/src/ge/common/model_parser/base.h @@ -28,40 +28,40 @@ namespace ge { class ModelParserBase { public: - /// - /// @ingroup hiai - /// @brief constructor - /// + /** + * @ingroup hiai + * @brief constructor + */ ModelParserBase(); - /// - /// @ingroup hiai - /// @brief destructor - /// + /** + * @ingroup hiai + * @brief destructor + */ ~ModelParserBase(); - /// - /// @ingroup hiai - /// @brief Parsing a model file - /// @param [in] model_file model path - /// @param [in] model_key model secret key - /// @param [in] priority modle priority - /// @param [out] model_data model data - /// @return Status result - /// + /** + * @ingroup hiai + * @brief Parsing a model file + * @param [in] model_file model path + * @param [in] model_key model secret key + * @param [in] priority modle priority + * @param [out] model_data model data + * @return Status result + */ static Status LoadFromFile(const char *model_file, const char *model_key, int32_t priority, ge::ModelData &model_data); - /// - /// @ingroup domi_ome - /// @brief Parse model contents from the ModelData - /// @param [in] model model data read from file - /// @param [out] model_data address of the model data - /// @param [out] model_len model actual length - /// If the input is an encrypted model, it needs to be deleted - /// @return SUCCESS success - /// @return others failure - /// @author - /// + /** + * @ingroup domi_ome + * @brief Parse model contents from the ModelData + * @param [in] model model data read from file + * @param [out] model_data address of the model data + * @param [out] model_len model actual length + * If the input is an encrypted model, it needs to be deleted + * @return SUCCESS success + * @return others failure + * @author + */ static Status ParseModelContent(const ge::ModelData &model, uint8_t *&model_data, uint32_t &model_len); }; } // namespace ge diff --git a/src/ge/common/model_saver.cc b/src/ge/common/model_saver.cc index c3b780f7..f68051f4 100644 --- a/src/ge/common/model_saver.cc +++ b/src/ge/common/model_saver.cc @@ -57,16 +57,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelSaver::SaveJsonToFi mode_t mode = S_IRUSR | S_IWUSR; int32_t fd = mmOpen2(real_path, O_RDWR | O_CREAT | O_TRUNC, mode); if (fd == EN_ERROR || fd == EN_INVALID_PARAM) { - GELOGE(FAILED, "Open file failed. file path : %s", file_path); + GELOGE(FAILED, "Open file failed. file path : %s, %s", file_path, strerror(errno)); return FAILED; } const char *model_char = model_str.c_str(); uint32_t len = static_cast(model_str.length()); // Write data to file - int32_t mmpa_ret = mmWrite(fd, const_cast((const void *)model_char), len); + mmSsize_t mmpa_ret = mmWrite(fd, const_cast((const void *)model_char), len); if (mmpa_ret == EN_ERROR || mmpa_ret == EN_INVALID_PARAM) { // Need to both print the error info of mmWrite and mmClose, so return ret after mmClose - GELOGE(FAILED, "Write to file failed. errno = %d", mmpa_ret); + GELOGE(FAILED, "Write to file failed. 
errno = %d, %s", mmpa_ret, strerror(errno)); ret = FAILED; } // Close file diff --git a/src/ge/common/op/attr_define.cc b/src/ge/common/op/attr_define.cc index a2f703ed..f9929a5e 100644 --- a/src/ge/common/op/attr_define.cc +++ b/src/ge/common/op/attr_define.cc @@ -651,6 +651,8 @@ const std::string ATTR_MODEL_STREAM_NUM = "stream_num"; const std::string ATTR_MODEL_EVENT_NUM = "event_num"; +const std::string ATTR_MODEL_LABEL_NUM = "label_num"; + const std::string ATTR_MODEL_MEMORY_SIZE = "memory_size"; const std::string ATTR_MODEL_WEIGHT_SIZE = "weight_size"; @@ -783,7 +785,7 @@ const std::string LOG_TIME_STAMP_NOTIFY = "notify"; /*ShapeN*/ const std::string SHAPEN_ATTR_N = "N"; const std::string SHAPEN_ATTR_IN_TYPE = "in_type"; -const std::string SHAPEN_ATTR_OUT_TYPE = "out_type"; +const std::string SHAPEN_ATTR_OUT_TYPE = "dtype"; /* control flow */ const std::string ATTR_NAME_ITERATORS_PER_LOOP = "iterations_per_loop"; @@ -807,4 +809,6 @@ const std::string ATTR_NAME_LINK_WITH_SPARE = "link_with_sparse"; const std::string ATTR_NAME_NET_OUTPUT_FORMAT = "net_output_format"; const std::string ATTR_NAME_NET_OUTPUT_DATATYPE = "net_output_datatype"; +/* For constant folding */ +const std::string ATTR_NO_NEED_CONSTANT_FOLDING = "no_need_constant_folding"; } // namespace domi diff --git a/src/ge/common/op/attr_value_util.cc b/src/ge/common/op/attr_value_util.cc index 957e558d..5d74aa1d 100644 --- a/src/ge/common/op/attr_value_util.cc +++ b/src/ge/common/op/attr_value_util.cc @@ -15,7 +15,6 @@ */ #include "framework/common/op/attr_value_util.h" - #include "framework/common/debug/log.h" #include "framework/common/util.h" @@ -84,30 +83,27 @@ DEFINE_SET_ATTR_VALUE_LIST(const std::string &, s); ADD_TO_ATTR_MAP(map_key, value, attr) \ } \ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void AddOpAttr(KEY_TYPE map_key, VALUE_TYPE value, \ - AttrDefMap *attr_map) { \ - ADD_TO_ATTR_MAP(map_key, value, attr_map) \ - } \ - FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void AddModelAttr(KEY_TYPE map_key, VALUE_TYPE value, \ - ModelDef *model_def) { \ + AttrDefMap *attr_map){ \ + ADD_TO_ATTR_MAP(map_key, value, attr_map)} FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void \ + AddModelAttr(KEY_TYPE map_key, VALUE_TYPE value, ModelDef *model_def) { \ GE_CHECK_NOTNULL_JUST_RETURN(model_def); \ auto attr = model_def->mutable_attr(); \ ADD_TO_ATTR_MAP(map_key, value, attr) \ } -#define DEFINE_ADD_ATTR_VALUE_LIST(KEY_TYPE, VALUE_TYPE) \ - FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void AddOpAttrList(KEY_TYPE map_key, VALUE_TYPE value, \ - OpDef *op_def) { \ - GE_CHECK_NOTNULL_JUST_RETURN(op_def); \ - auto attr = op_def->mutable_attr(); \ - ADD_TO_ATTR_MAP_LIST(map_key, value, attr) \ - } \ - FMK_FUNC_DEV_VISIBILITY void AddOpAttrList(KEY_TYPE map_key, VALUE_TYPE value, AttrDefMap *attr_map) { \ - ADD_TO_ATTR_MAP_LIST(map_key, value, attr_map) \ - } \ - FMK_FUNC_DEV_VISIBILITY void AddModelAttrList(KEY_TYPE map_key, VALUE_TYPE value, ModelDef *model_def) { \ - GE_CHECK_NOTNULL_JUST_RETURN(model_def); \ - auto attr = model_def->mutable_attr(); \ - ADD_TO_ATTR_MAP_LIST(map_key, value, attr) \ +#define DEFINE_ADD_ATTR_VALUE_LIST(KEY_TYPE, VALUE_TYPE) \ + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void AddOpAttrList(KEY_TYPE map_key, VALUE_TYPE value, \ + OpDef *op_def) { \ + GE_CHECK_NOTNULL_JUST_RETURN(op_def); \ + auto attr = op_def->mutable_attr(); \ + ADD_TO_ATTR_MAP_LIST(map_key, value, attr) \ + } \ + FMK_FUNC_DEV_VISIBILITY void AddOpAttrList(KEY_TYPE map_key, VALUE_TYPE value, AttrDefMap 
*attr_map){ \ + ADD_TO_ATTR_MAP_LIST(map_key, value, attr_map)} FMK_FUNC_DEV_VISIBILITY void \ + AddModelAttrList(KEY_TYPE map_key, VALUE_TYPE value, ModelDef *model_def) { \ + GE_CHECK_NOTNULL_JUST_RETURN(model_def); \ + auto attr = model_def->mutable_attr(); \ + ADD_TO_ATTR_MAP_LIST(map_key, value, attr) \ } DEFINE_ADD_ATTR_VALUE(const std::string &, const std::string &); @@ -157,16 +153,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void AddOpAttr(const std::strin return false; \ } -#define DEFINE_GET_ATTR_CONST_POINT_REF(ARG_TYPE_KEY, ARG_TYPE_VALUE, FIELD) \ - FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool GetAttrDefValue( \ - ARG_TYPE_KEY map_key, const ARG_TYPE_VALUE *&value, const AttrDefMap &attr) { \ - auto it = attr.find(map_key); \ - if (it == attr.end()) { \ - return false; \ - } \ - \ - value = &(it->second.FIELD()); \ - return true; \ +#define DEFINE_GET_ATTR_CONST_POINT_REF(ARG_TYPE_KEY, ARG_TYPE_VALUE, FIELD) \ + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool GetAttrDefValue( \ + ARG_TYPE_KEY map_key, const ARG_TYPE_VALUE *&value, const AttrDefMap &attr) { \ + auto it = attr.find(map_key); \ + if (it == attr.end()) { \ + return false; \ + } \ + \ + value = &(it->second.FIELD()); \ + return true; \ } #define DEFINE_GET_BYTES_ATTR_VALUE(ARG_TYPE_KEY, ARG_TYPE_VALUE) \ diff --git a/src/ge/common/op/ge_op_utils.cc b/src/ge/common/op/ge_op_utils.cc index 2a8d81ea..b3bed399 100644 --- a/src/ge/common/op/ge_op_utils.cc +++ b/src/ge/common/op/ge_op_utils.cc @@ -18,6 +18,7 @@ #include +#include "common/fp16_t.h" #include "common/ge/ge_util.h" #include "external/graph/types.h" #include "framework/common/debug/ge_log.h" @@ -33,23 +34,11 @@ #include "graph/utils/type_utils.h" #include "mmpa/mmpa_api.h" -#define RETURN_IF_TRUE(cond, errcode, ...) 
\ - do { \ - if (cond) { \ - GELOGE(errcode, __VA_ARGS__); \ - return errcode; \ - } \ - } while (0); - -using domi::DOMI_TENSOR_NCHW; +using ge::fp16_t; using std::vector; namespace ge { // General constant -const int32_t kDimMaxSize = 8; -const float DEFAULT_ALPHA_VALUE = 1.0; -const float DEFAULT_BETA_VALUE = 0.0; -const int NORMAL_TENSOR_SIZE = 4; const int32_t kDimSizeZero = 0; const int32_t kDimSizeOne = 1; const int32_t kDimSizeTwo = 2; @@ -58,13 +47,13 @@ const uint32_t kSliceDataNum = 2; // Add Sub Mul const uint32_t ADD_INPUT_NUM = 2; -const uint32_t SUB_INPUT_NUM = 2; const uint32_t MUL_INPUT_NUM = 2; // Permute const int32_t PERMUTE_ORDER_NUM = 4; // Ssd PriroBox const double SSD_PRIORBOX_ASPECT_RATIO_VALUE = 1.0; + // Switch const uint32_t SWITCH_INPUT_NUM = 2; const uint32_t SWITCH_OUTPUT_NUM = 2; @@ -73,6 +62,15 @@ const uint32_t SWITCH_TRUE_OUTPUT = 1; const uint32_t SWITCH_DATA_INPUT = 0; const uint32_t SWITCH_PRED_INPUT = 1; +// FunctionOp +const uint32_t IF_COND_INPUT = 0; +const uint32_t FOR_START_INPUT = 0; +const uint32_t FOR_LIMIT_INPUT = 1; +const uint32_t FOR_DELTA_INPUT = 2; +const uint32_t FOR_DATA_INPUT = 3; + +const int NORMAL_TENSOR_SIZE = 4; + // Get the value of key from attr #define AIPP_GET_ATTR_VALUE(KEY, ATTR_TYPE) \ if (aipp_attr.GetItem(#KEY).GetValue(KEY) != SUCCESS) { \ @@ -221,88 +219,128 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OpUtils::TransferDim(con return SUCCESS; } -void OpUtils::SliceData(std::vector &input, int64_t chunk_size, std::vector &output, int64_t begin, - int64_t out_dim, int64_t stride) { +template +void OpUtils::SliceData(const std::vector &input, int64_t chunk_size, std::vector &output, + int64_t begin, int64_t out_dim, int64_t stride) { char *slice = nullptr; + // chunk_size * (begin + (out_dim-1)*stride) always less than chunk_size * dim_i, no need to check. 
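+ // For each input block, emit `out_dim` pointers: start at chunk offset `begin`, then step
+ // `stride` chunks per element; a chunk holds `chunk_size` values of type T, hence the
+ // sizeof(T) scaling of every offset below.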
for (size_t j = 0; j < input.size(); j++) { - slice = input[j] + sizeof(int32_t) * begin * chunk_size; + slice = input[j] + sizeof(T) * begin * chunk_size; for (int64_t i = 0; i < out_dim; i++) { - output.push_back(slice + sizeof(int32_t) * i * chunk_size * stride); + output.push_back(slice + sizeof(T) * i * chunk_size * stride); } } } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OpUtils::SetOutputSliceData( - void *data, int64_t data_size, int32_t data_type, std::vector<int64_t> &input_dims, std::vector<int64_t> &begin, - std::vector<int64_t> &output_dims, GeTensor *output, std::vector<int64_t> &stride) { - GE_CHECK_NOTNULL(data); - GE_CHECK_NOTNULL(output); +template <typename T> +Status OpUtils::SetDataByDataType(size_t out_size, const std::vector<char *> &chunk_input, + const std::vector<char *> &chunk_output, GeTensor *output) { + unique_ptr<T[]> output_data(new (std::nothrow) T[out_size]()); + if (output_data == nullptr) { + GELOGE(MEMALLOC_FAILED, "New buf failed"); + return INTERNAL_ERROR; + } + + if (!chunk_input.empty()) { + for (size_t j = 0; j < out_size; j++) { + T *value = reinterpret_cast<T *>(chunk_input[j]); + output_data[j] = value[0]; + } + } else { + for (size_t j = 0; j < out_size; j++) { + T *value = reinterpret_cast<T *>(chunk_output[j]); + output_data[j] = value[0]; + } + } + + // output_data != nullptr and out_size > 0, SetData always return success, no need to check value + (void)output->SetData(reinterpret_cast<uint8_t *>(output_data.get()), out_size * sizeof(T)); + return SUCCESS; +} + +template <typename T> +Status OpUtils::SetOutputSliceDataByDataType(void *data, int64_t data_size, const std::vector<int64_t> &input_dims, + const std::vector<int64_t> &begin, const std::vector<int64_t> &output_dims, + GeTensor *output, const std::vector<int64_t> &stride) { std::vector<char *> chunk_input; std::vector<char *> chunk_output; chunk_input.push_back(reinterpret_cast<char *>(data)); int64_t chunk_size = data_size; - int dim_size = static_cast<int>(input_dims.size()); - for (int i = 0; i < dim_size; i++) { + size_t dim_size = input_dims.size(); + for (size_t i = 0; i < dim_size; i++) { int64_t begin_i = begin[i]; int64_t size_i = output_dims[i]; int64_t dim_i = input_dims[i]; int64_t stride_i = stride[i]; - GE_CHK_BOOL_EXEC((dim_i != 0), return PARAM_INVALID, "Dim_i can't be 0."); + if (dim_i == 0) { + GELOGE(PARAM_INVALID, "Dim_i of size tensor can't be 0."); + return PARAM_INVALID; + } chunk_size = chunk_size / dim_i; if (i % kSliceDataNum == 0) { - SliceData(chunk_input, chunk_size, chunk_output, begin_i, size_i, stride_i); + SliceData<T>(chunk_input, chunk_size, chunk_output, begin_i, size_i, stride_i); chunk_input.clear(); } else { - SliceData(chunk_output, chunk_size, chunk_input, begin_i, size_i, stride_i); + SliceData<T>(chunk_output, chunk_size, chunk_input, begin_i, size_i, stride_i); chunk_output.clear(); } } size_t out_size = chunk_input.size() + chunk_output.size(); GE_CHK_BOOL_RET_STATUS(out_size > 0, FAILED, "Out_size <= 0"); + Status ret = SetDataByDataType<T>(out_size, chunk_input, chunk_output, output); + return ret; +} - - if (data_type == DT_FLOAT) { - float *output_data = new (std::nothrow) float[out_size](); - GE_CHECK_NOTNULL(output_data); - if (!chunk_input.empty()) { - for (size_t j = 0; j < out_size; j++) { - float *value = reinterpret_cast<float *>(chunk_input[j]); - output_data[j] = *value; - } - } else { - for (size_t j = 0; j < out_size; j++) { - float *value = reinterpret_cast<float *>(chunk_output[j]); - output_data[j] = *value; - } - } - (void)output->SetData(reinterpret_cast<uint8_t *>(output_data), out_size * sizeof(float)); - // output_data != nullptr and out_size > 0, SetData always return success, no need to check value -
GE_DELETE_NEW_ARRAY(output_data); - } else if (data_type == DT_INT32) { - int *output_data = new (std::nothrow) int[out_size](); - GE_CHECK_NOTNULL(output_data); - - if (!chunk_input.empty()) { - for (size_t j = 0; j < out_size; j++) { - int *value = reinterpret_cast<int *>(chunk_input[j]); - output_data[j] = *value; - } - } else { - for (size_t j = 0; j < out_size; j++) { - int *value = reinterpret_cast<int *>(chunk_output[j]); - output_data[j] = *value; - } - } - (void)output->SetData(reinterpret_cast<uint8_t *>(output_data), out_size * sizeof(int)); - // output_data != nullptr and out_size > 0, SetData always return success, no need to check value - GE_DELETE_NEW_ARRAY(output_data); - } else { - GELOGE(FAILED, "Data type of Slice OP must be float or int32."); - return FAILED; +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OpUtils::SetOutputSliceData( + void *data, int64_t data_size, int32_t data_type, std::vector<int64_t> &input_dims, std::vector<int64_t> &begin, + std::vector<int64_t> &output_dims, GeTensor *output, std::vector<int64_t> &stride) { + if (data == nullptr || output == nullptr) { + GELOGE(PARAM_INVALID, "Input param is nullptr."); + return PARAM_INVALID; } - return SUCCESS; + Status ret; + switch (data_type) { + case DT_INT32: + ret = SetOutputSliceDataByDataType<int32_t>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_FLOAT: + ret = SetOutputSliceDataByDataType<float>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_DOUBLE: + ret = SetOutputSliceDataByDataType<double>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_FLOAT16: + ret = SetOutputSliceDataByDataType<fp16_t>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_UINT8: + ret = SetOutputSliceDataByDataType<uint8_t>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_INT8: + ret = SetOutputSliceDataByDataType<int8_t>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_UINT16: + ret = SetOutputSliceDataByDataType<uint16_t>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_INT16: + ret = SetOutputSliceDataByDataType<int16_t>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_UINT32: + ret = SetOutputSliceDataByDataType<uint32_t>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_UINT64: + ret = SetOutputSliceDataByDataType<uint64_t>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + case DT_INT64: + ret = SetOutputSliceDataByDataType<int64_t>(data, data_size, input_dims, begin, output_dims, output, stride); + break; + default: + GELOGW("Unsupported data type: %s", TypeUtils::DataTypeToSerialString(static_cast<DataType>(data_type)).c_str()); + return PARAM_INVALID; + } + return ret; } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void OpUtils::TransDataHWCK2KCHW(const void *input, int64_t h, diff --git a/src/ge/common/profiling/profiling_manager.cc b/src/ge/common/profiling/profiling_manager.cc index bbe105b8..b4bab921 100644 --- a/src/ge/common/profiling/profiling_manager.cc +++ b/src/ge/common/profiling/profiling_manager.cc @@ -35,6 +35,7 @@ const char *const kEvents = "events"; const char *const kAiCoreEvents = "ai_core_events"; const char *const kName = "name"; const char *const kTraceID = "traceId"; +const size_t kReportMaxLen = 2048; } // namespace namespace ge { @@ -49,7 +50,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager &ProfilingMana FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status
ProfilingManager::Init(const Options &options) { #ifdef DAVINCI_SUPPORT_PROFILING - device_id_ = options.device_id; + device_id_.push_back(options.device_id); job_id_ = options.job_id; Status ret; @@ -73,12 +74,14 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In return FAILED; } // profiling startup first time - ret = StartProfiling(0); - if (ret != SUCCESS) { - GELOGE(ret, "Profiling start failed."); - return FAILED; + for (size_t i = 0; i < device_id_.size(); ++i) { + ret = StartProfiling(0, device_id_[i]); + if (ret != SUCCESS) { + GELOGE(ret, "Profiling start failed."); + return FAILED; + } + GELOGI("Profiling init succ."); } - GELOGI("Profiling init succ."); } #endif return SUCCESS; @@ -94,6 +97,31 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In Json start_prof_conf = Json::parse(config); Json &prof_conf = start_prof_conf[kStartCfg][0]; job_id_ = prof_conf[kJobID]; + Json &device_id = prof_conf[kDeviceID]; + if (device_id.size() != 0) { + vector().swap(device_id_); + bool is_all = false; + for (size_t i = 0; i < device_id.size(); i++) { + std::string device_id_str = device_id[i].get(); + if (device_id_str == "all") { + is_all = true; + break; + } + device_id_.push_back(std::stoi(device_id_str)); + } + if (is_all == true) { + int32_t count = 0; + rtError_t rt_err = rtGetDeviceCount(&count); + if (rt_err != RT_ERROR_NONE) { + GELOGE(FAILED, "Call rtGetDeviceCount to get device failed."); + } + + vector().swap(device_id_); + for (int32_t i = 0; i < count; ++i) { + device_id_.push_back(i); + } + } + } GELOGI("Profiling json config from acl:%s", config.c_str()); Json &features = prof_conf[kFeatures]; @@ -107,8 +135,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In if (name == "op_trace") { GELOGI("Op trace config from acl"); Json &conf = feature[kConf]; - Json &events = conf[kEvents]; - const std::string &ai_core_events = events[kAiCoreEvents]; + Json &events = conf[0][kEvents]; + const std::string &ai_core_events = events[0][kAiCoreEvents]; GELOGI("Op trace config from acl ai_core_events:%s", ai_core_events.c_str()); is_op_trace_ = true; // op trace get conf @@ -124,6 +152,13 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In } else if (name == "task_trace") { is_op_trace_ = false; GELOGI("Task trace config from acl"); + } else if (name == "system_trace") { + is_op_trace_ = false; + Json &conf = feature[kConf]; + std::stringstream system_trace_conf; + system_trace_conf << conf; + system_trace_conf_ = system_trace_conf.str(); + GELOGI("System trace config from acl"); } profiling_opts_.push_back(name); } @@ -141,7 +176,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In #ifdef DAVINCI_SUPPORT_PROFILING const char *is_profiling = std::getenv("PROFILING_MODE"); const char *prof_options = std::getenv("PROFILING_OPTIONS"); - GELOGI("The profiling in options is %s, %s", is_profiling, prof_options); if ((is_profiling == nullptr) || (strcmp("true", is_profiling) != 0) || (prof_options == nullptr)) { // default training trace on is_profiling_ = false; @@ -151,6 +185,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In profiling_opts_ = StringUtils::Split(prof_options_str, ':'); is_profiling_ = true; } + GELOGI("The profiling in options is %s, %s", is_profiling, prof_options); // features:'training_trace', 'task_trace' or 'op_trace' etc if (!profiling_opts_.empty()) { @@ -175,7 +210,8 @@ 
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In return ge::SUCCESS; } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::StartProfiling(int32_t iter_num) { +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::StartProfiling(int32_t iter_num, + int32_t device_id) { #ifdef DAVINCI_SUPPORT_PROFILING if (!profiling_opts_.empty()) { GELOGI("Start profiling index is %d", iter_num); @@ -184,7 +220,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::St try { // profiling need physical_device_id - p_device[kDeviceID] = std::to_string(device_id_); + p_device[kDeviceID] = std::to_string(device_id); p_device[kJobID] = job_id_; p_device[kTraceID] = std::to_string(GetContext().TraceId()); @@ -197,7 +233,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::St GELOGE(FAILED, "Op trace iter num is invalid!"); return FAILED; } - conf = nlohmann::json::parse(op_trace_conf_[iter_num]); + Json events; + events[0] = nlohmann::json::parse(op_trace_conf_[iter_num]); + conf[0][kEvents] = events; f[kConf] = conf; features[0] = f; if (iter_num == 0) { @@ -206,6 +244,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::St } else { for (std::vector::size_type i = 0; i < profiling_opts_.size(); i++) { Json f; + if (profiling_opts_[i] == "system_trace") { + f[kConf] = nlohmann::json::parse(system_trace_conf_); + } f[kName] = profiling_opts_[i]; features[i] = f; } @@ -234,11 +275,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::St // call profiling startup API ProfMgrCfg prof_cfg = {send_profiling_config_}; - prof_handle = ProfMgrStartUp(&prof_cfg); + void *prof_handle = ProfMgrStartUp(&prof_cfg); if (prof_handle == nullptr) { GELOGW("ProfMgrStartUp failed."); return FAILED; } + prof_handle_vec_.push_back(prof_handle); } #endif return SUCCESS; @@ -257,45 +299,182 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProf GELOGI("Call rtProfilerStop ret:%d", rt_ret); } - if (prof_handle != nullptr) { - int result = ProfMgrStop(prof_handle); + for (size_t i = 0; i < prof_handle_vec_.size(); ++i) { + int result = ProfMgrStop(prof_handle_vec_[i]); if (result != 0) { GELOGW("ProfMgr stop return fail:%d.", result); return; } } + vector().swap(prof_handle_vec_); is_load_ = false; recv_profiling_config_ = ""; GELOGI("Stop Profiling success."); #endif } -FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportProfilingData( - const std::map &op_task_id_map) { +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ProfilingTaskDescInfo( + const std::vector &task_desc_info, const int32_t &device_id) { #ifdef DAVINCI_SUPPORT_PROFILING Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); if (reporter == nullptr) { GELOGI("Profiling report is nullptr!"); return; } + std::string data; - for (const auto &iter : op_task_id_map) { - data = iter.second + ' ' + std::to_string(iter.first) + ';'; + for (const auto &task : task_desc_info) { + std::string op_name = task.op_name; + uint32_t block_dim = task.block_dim; + uint32_t task_id = task.task_id; + uint32_t stream_id = task.stream_id; + data = op_name.append(" ").append(std::to_string(block_dim) + .append(" ") + .append(std::to_string(task_id)) + .append(" ") + .append(std::to_string(stream_id)) + .append("\n")); + Msprof::Engine::ReporterData reporter_data{}; - reporter_data.deviceId = device_id_; + 
reporter_data.deviceId = device_id; reporter_data.data = (unsigned char *)data.c_str(); reporter_data.dataLen = data.size(); - int ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "framework", sizeof("framework")); + int ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "task_desc_info", sizeof("task_desc_info")); if (ret != EOK) { - GELOGE(ret, "Report data tag memcpy error!"); + GELOGE(ret, "Report data tag of task_desc_info memcpy error!"); return; } + ret = reporter->Report(&reporter_data); if (ret != SUCCESS) { - GELOGE(ret, "Reporter data fail!"); + GELOGE(ret, "Reporter data of task_desc_info fail!"); return; } } + + data.clear(); +#endif +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ProfilingGraphDescInfo( + const std::vector &compute_graph_desc_info, const int32_t &device_id) { +#ifdef DAVINCI_SUPPORT_PROFILING + Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); + GE_IF_BOOL_EXEC(reporter == nullptr, GELOGI("Profiling report is nullptr!"); return;); + + std::string data; + for (const auto &graph : compute_graph_desc_info) { + data.append("op_name:").append(graph.op_name).append(" op_type:").append(graph.op_type); + for (size_t i = 0; i < graph.input_format.size(); ++i) { + data.append(" input_id:") + .append(std::to_string(i)) + .append(" input_format:") + .append(std::to_string(graph.input_format.at(i))) + .append(" input_data_type:") + .append(std::to_string(graph.input_data_type.at(i))) + .append(" input_shape:\""); + size_t input_shape_len = graph.input_shape.at(i).size(); + if (input_shape_len == 0) { + data.append(""); + } else if (input_shape_len == 1) { + data.append(std::to_string(graph.input_shape.at(i).at(0))); + } else { + for (size_t j = 0; j < input_shape_len - 1; ++j) { + data.append(std::to_string(graph.input_shape.at(i).at(j))).append(","); + } + data.append(std::to_string(graph.input_shape.at(i).at(input_shape_len - 1))); + } + + data.append("\""); + } + + for (size_t i = 0; i < graph.output_format.size(); ++i) { + data.append(" output_id:") + .append(std::to_string(i)) + .append(" output_format:") + .append(std::to_string(graph.output_format.at(i))) + .append(" output_data_type:") + .append(std::to_string(graph.output_data_type.at(i))) + .append(" output_shape:\""); + size_t output_shape_len = graph.output_shape.at(i).size(); + if (output_shape_len == 0) { + data.append(""); + } else if (output_shape_len == 1) { + data.append(std::to_string(graph.output_shape.at(i).at(0))); + } else { + for (size_t j = 0; j < output_shape_len - 1; ++j) { + data.append(std::to_string(graph.output_shape.at(i).at(j))).append(","); + } + data.append(std::to_string(graph.output_shape.at(i).at(output_shape_len - 1))); + } + data.append("\""); + } + + data.append("\n"); + + Msprof::Engine::ReporterData reporter_data{}; + Report(device_id, data, *reporter, reporter_data); + + data.clear(); + } +#endif +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Report( + const int32_t &device_id, const string &data, Msprof::Engine::Reporter &reporter, + Msprof::Engine::ReporterData &reporter_data) { +#ifdef DAVINCI_SUPPORT_PROFILING + size_t index = data.size() / kReportMaxLen; + if (index >= 1) { + reporter_data.deviceId = device_id; + int ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info")); + GE_IF_BOOL_EXEC(ret != EOK, GELOGE(ret, "Report data tag of graph_desc_info memcpy error!"); return;); + for (size_t i = 0; i < index; 
++i) { + reporter_data.data = (unsigned char *)data.c_str() + kReportMaxLen * i; + reporter_data.dataLen = kReportMaxLen; + ret = reporter.Report(&reporter_data); + GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Reporter data of graph_desc_info fail!"); return;); + } + reporter_data.dataLen = data.size() - kReportMaxLen * index; + if (reporter_data.dataLen != 0) { + reporter_data.data = (unsigned char *)data.c_str() + kReportMaxLen * index; + ret = reporter.Report(&reporter_data); + GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Reporter data of graph_desc_info fail!"); return;); + } + } else { + reporter_data.deviceId = device_id; + reporter_data.data = (unsigned char *)data.c_str(); + reporter_data.dataLen = data.size(); + int ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info")); + GE_IF_BOOL_EXEC(ret != EOK, GELOGE(ret, "Report data tag of graph_desc_info memcpy error!"); return;); + + ret = reporter.Report(&reporter_data); + GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Reporter data of graph_desc_info fail!"); return;); + } +#endif +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportProfilingData( + const std::vector &task_desc_info, const std::vector &compute_graph_desc_info) { +#ifdef DAVINCI_SUPPORT_PROFILING + int32_t device_id = 0; + rtError_t rt_ret = rtGetDevice(&device_id); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(rt_ret, "runtime get device_id failed, current device_id:%d", device_id); + return; + } + GELOGI("current device_id:%d", device_id); + + auto ret = std::find(device_id_.begin(), device_id_.end(), device_id); + if (ret == device_id_.end()) { + GELOGE(FAILED, "get valid device_id failed, profiling report failed."); + return; + } + + GELOGI("start ProfilingTaskDescInfo."); + ProfilingTaskDescInfo(task_desc_info, device_id); + GELOGI("start ProfilingGraphDescInfo."); + ProfilingGraphDescInfo(compute_graph_desc_info, device_id); GELOGI("Report profiling data for GE end."); #endif } diff --git a/src/ge/common/profiling/profiling_manager.h b/src/ge/common/profiling/profiling_manager.h index 6b1645de..d3bfec63 100644 --- a/src/ge/common/profiling/profiling_manager.h +++ b/src/ge/common/profiling/profiling_manager.h @@ -40,14 +40,23 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { ge::Status Init(const Options &options); ge::Status InitFromEnv(const Options &options); ge::Status InitFromAclCfg(const std::string &config); - ge::Status StartProfiling(int32_t iter); + ge::Status StartProfiling(int32_t iter, int32_t device_id); void StopProfiling(); bool ProfilingOpTraceOn() const { return is_op_trace_; } bool ProfilingLoadFlag() const { return is_load_; } bool ProfilingOn() const { return is_profiling_; } int32_t GetOpTraceIterNum() const { return op_trace_iter_num_; } - void ReportProfilingData(const std::map &op_task_id_map); + + void ReportProfilingData(const std::vector &task_desc_info, + const std::vector &compute_graph_desc_info); + + void Report(const int32_t &device_id, const string &data, Msprof::Engine::Reporter &reporter, + Msprof::Engine::ReporterData &reporter_data); + void ProfilingTaskDescInfo(const std::vector &task_desc_info, const int32_t &device_id); + void ProfilingGraphDescInfo(const std::vector &compute_graph_desc_info, + const int32_t &device_id); void SetProfilingConfig(const string &profiling_cfg); + vector GetProfilingDeviceId() const { return device_id_; } private: bool is_profiling_ = false; @@ -55,17 +64,18 @@ class 
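
Report() above submits the payload in kReportMaxLen-sized slices plus a remainder, presumably because a single reporter submission is capped. The slicing arithmetic in isolation; the value 8 for kReportMaxLen is arbitrary here, the patch defines its own constant elsewhere:

#include <cstdio>
#include <string>

const size_t kReportMaxLen = 8;  // arbitrary stand-in for the patch's constant

// Emits data in slices of at most kReportMaxLen bytes, mirroring Report().
void EmitInChunks(const std::string &data) {
  size_t full_chunks = data.size() / kReportMaxLen;
  for (size_t i = 0; i < full_chunks; ++i) {
    std::printf("chunk: %.*s\n", static_cast<int>(kReportMaxLen), data.c_str() + kReportMaxLen * i);
  }
  size_t rest = data.size() - kReportMaxLen * full_chunks;
  if (rest != 0) {
    std::printf("tail : %.*s\n", static_cast<int>(rest), data.c_str() + kReportMaxLen * full_chunks);
  }
}

int main() {
  EmitInChunks("op_name:conv1 op_type:Conv2D");
  return 0;
}
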
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { bool is_load_ = false; int32_t op_trace_iter_num_ = 0; string job_id_; - int32_t device_id_ = 0; + vector device_id_; vector op_trace_conf_; vector profiling_opts_; - void *prof_handle = nullptr; + vector prof_handle_vec_; string recv_profiling_config_; string send_profiling_config_; + string system_trace_conf_; }; -/// -/// @brief register Plugin -/// +/** + * @brief register Plugin + */ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY PluginImpl : public Msprof::Engine::PluginIntf { public: explicit PluginImpl(const std::string &module); @@ -80,9 +90,9 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY PluginImpl : public Mspro std::string module_; }; -/// -/// @brief register Engine -/// +/** + * @brief register Engine + */ class ProfilingEngineImpl : public Msprof::Engine::EngineIntf { public: ProfilingEngineImpl() {} diff --git a/src/ge/common/properties_manager.cc b/src/ge/common/properties_manager.cc index 7ec56473..b34f9463 100644 --- a/src/ge/common/properties_manager.cc +++ b/src/ge/common/properties_manager.cc @@ -20,18 +20,15 @@ #include #include +#include "common/util.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" #include "framework/common/ge_types.h" #include "framework/common/types.h" #include "graph/debug/ge_attr_define.h" #include "graph/utils/attr_utils.h" -#include "common/util.h" namespace ge { - -static const std::set black_list = {"IteratorV2"}; - PropertiesManager::PropertiesManager() : is_inited_(false), delimiter("=") {} PropertiesManager::~PropertiesManager() {} @@ -64,7 +61,7 @@ bool PropertiesManager::LoadFileContent(const std::string &file_path) { // Normalize the path string resolved_file_path = RealPath(file_path.c_str()); if (resolved_file_path.empty()) { - GE_LOGE("Invalid input file path [%s], make sure that the file path is correct.", file_path.c_str()); + DOMI_LOGE("Invalid input file path [%s], make sure that the file path is correct.", file_path.c_str()); return false; } std::ifstream fs(resolved_file_path, std::ifstream::in); @@ -189,6 +186,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void PropertiesManager::ClearDu model_dump_properties_map_.clear(); } +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::set PropertiesManager::GetAllDumpModel() { + std::set model_list; + std::lock_guard lock(dump_mutex_); + for (auto &iter : model_dump_properties_map_) { + model_list.insert(iter.first); + } + + return model_list; +} + FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::set PropertiesManager::GetDumpPropertyValue( const std::string &model) { std::lock_guard lock(dump_mutex_); @@ -202,11 +209,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::set Propertie FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool PropertiesManager::IsLayerNeedDump(const std::string &model, const std::string &op_name) { std::lock_guard lock(dump_mutex_); - - if (black_list.find(op_name) != black_list.end()) { - return false; - } - // if dump all if (model_dump_properties_map_.find(ge::DUMP_ALL_MODEL) != model_dump_properties_map_.end()) { return true; @@ -260,4 +262,15 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::string PropertiesManager:: std::lock_guard lock(dump_mutex_); return this->output_path_; } + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void PropertiesManager::SetDumpStep(const std::string &dump_step) { + std::lock_guard lock(dump_mutex_); + this->dump_step_ = dump_step; +} + +FMK_FUNC_HOST_VISIBILITY 
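
GetAllDumpModel in the properties_manager.cc hunk above is the usual collect-the-keys-under-a-lock pattern. A minimal sketch, assuming a std::map keyed by model name as in model_dump_properties_map_:

#include <map>
#include <mutex>
#include <set>
#include <string>

class DumpRegistry {
 public:
  std::set<std::string> AllModels() {
    std::lock_guard<std::mutex> lock(mu_);  // guard against concurrent writers
    std::set<std::string> models;
    for (const auto &kv : props_) {
      models.insert(kv.first);
    }
    return models;
  }

 private:
  std::mutex mu_;
  std::map<std::string, std::set<std::string>> props_;
};

int main() {
  DumpRegistry reg;
  return reg.AllModels().empty() ? 0 : 1;
}
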
FMK_FUNC_DEV_VISIBILITY std::string PropertiesManager::GetDumpStep() { + std::lock_guard lock(dump_mutex_); + return this->dump_step_; +} + } // namespace ge diff --git a/src/ge/common/properties_manager.h b/src/ge/common/properties_manager.h index 6c4b2072..100b83f0 100644 --- a/src/ge/common/properties_manager.h +++ b/src/ge/common/properties_manager.h @@ -82,6 +82,7 @@ class PropertiesManager { void SetPropertyDelimiter(const std::string &de); void AddDumpPropertyValue(const std::string &model, const std::set &layers); + std::set GetAllDumpModel(); std::set GetDumpPropertyValue(const std::string &model); bool IsLayerNeedDump(const std::string &model, const std::string &op_name); void DeleteDumpPropertyValue(const std::string &model); @@ -91,6 +92,8 @@ class PropertiesManager { std::string GetDumpOutputModel(); void SetDumpOutputPath(const std::string &output_path); std::string GetDumpOutputPath(); + void SetDumpStep(const std::string &dump_step); + std::string GetDumpStep(); private: // Private construct, destructor @@ -116,6 +119,7 @@ class PropertiesManager { std::string output_mode_; std::string output_path_; + std::string dump_step_; std::map> model_dump_properties_map_; // model_dump_layers_map_ std::mutex dump_mutex_; }; diff --git a/src/ge/common/thread_pool.h b/src/ge/common/thread_pool.h index ea9227bf..92157275 100644 --- a/src/ge/common/thread_pool.h +++ b/src/ge/common/thread_pool.h @@ -52,8 +52,8 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ThreadPool { return fail_future; } - auto bind_func = std::bind(std::forward(func), std::forward(args)...); - auto task = ge::MakeShared>(bind_func); + auto bindFunc = std::bind(std::forward(func), std::forward(args)...); + auto task = ge::MakeShared>(bindFunc); if (task == nullptr) { GELOGE(ge::FAILED, "Make shared failed."); return fail_future; diff --git a/src/ge/common/types.cc b/src/ge/common/types.cc index e36c147f..da0853b6 100644 --- a/src/ge/common/types.cc +++ b/src/ge/common/types.cc @@ -15,7 +15,6 @@ */ #include "framework/common/types.h" - #include "graph/types.h" namespace ge { @@ -116,18 +115,19 @@ REGISTER_OPTYPE_DEFINE(GATHER, "Gather"); REGISTER_OPTYPE_DEFINE(REALDIV, "RealDiv"); REGISTER_OPTYPE_DEFINE(PACK, "Pack"); REGISTER_OPTYPE_DEFINE(SLICE, "Slice"); +REGISTER_OPTYPE_DEFINE(SLICED, "SliceD"); REGISTER_OPTYPE_DEFINE(FLOORDIV, "FloorDiv"); REGISTER_OPTYPE_DEFINE(SQUEEZE, "Squeeze"); REGISTER_OPTYPE_DEFINE(STRIDEDSLICE, "StridedSlice"); REGISTER_OPTYPE_DEFINE(RANGE, "Range"); -REGISTER_OPTYPE_DEFINE(RPNPROPOSALS, "GenerateRpnProposals"); -REGISTER_OPTYPE_DEFINE(DECODEBBOX, "DecodeBBox"); +REGISTER_OPTYPE_DEFINE(RPNPROPOSALS, "RpnProposals"); +REGISTER_OPTYPE_DEFINE(DECODEBBOX, "DecodeBbox"); REGISTER_OPTYPE_DEFINE(PAD, "Pad"); REGISTER_OPTYPE_DEFINE(PADV2, "PadV2"); REGISTER_OPTYPE_DEFINE(MIRRORPAD, "MirrorPad"); REGISTER_OPTYPE_DEFINE(TILE, "Tile"); REGISTER_OPTYPE_DEFINE(SIZE, "Size"); -REGISTER_OPTYPE_DEFINE(CLIPBOXES, "Clipboxes"); +REGISTER_OPTYPE_DEFINE(CLIPBOXES, "ClipBoxes"); REGISTER_OPTYPE_DEFINE(FASTRCNNPREDICTIONS, "FastrcnnPredictions"); REGISTER_OPTYPE_DEFINE(SPLIT, "Split"); REGISTER_OPTYPE_DEFINE(SPLITV, "SplitV"); @@ -138,6 +138,19 @@ REGISTER_OPTYPE_DEFINE(GREATER, "Greater"); REGISTER_OPTYPE_DEFINE(SWITCH, "Switch"); REGISTER_OPTYPE_DEFINE(SWITCHN, "SwitchN"); REGISTER_OPTYPE_DEFINE(MERGE, "Merge"); +REGISTER_OPTYPE_DEFINE(SYMBOLICGRADIENT, "SymbolicGradient"); +REGISTER_OPTYPE_DEFINE(REMOTECALL, "RemoteCall"); +REGISTER_OPTYPE_DEFINE(_IF, "_If"); +REGISTER_OPTYPE_DEFINE(STATELESSIF, 
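
The thread_pool.h hunk above only renames the bound functor; the commit pattern itself, std::bind into a std::packaged_task whose future is handed back to the caller, looks like this in isolation. This sketch detaches a thread per task instead of enqueueing, which the real pool does not do:

#include <functional>
#include <future>
#include <memory>
#include <thread>

template <class Func, class... Args>
auto Commit(Func &&func, Args &&... args) -> std::future<decltype(func(args...))> {
  using Ret = decltype(func(args...));
  // Bind arguments now, run later; the future carries the result back.
  auto bind_func = std::bind(std::forward<Func>(func), std::forward<Args>(args)...);
  auto task = std::make_shared<std::packaged_task<Ret()>>(bind_func);
  std::future<Ret> result = task->get_future();
  std::thread([task]() { (*task)(); }).detach();  // real pool enqueues instead
  return result;
}

int main() {
  auto fut = Commit([](int a, int b) { return a + b; }, 2, 3);
  return fut.get() == 5 ? 0 : 1;
}
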
"StatelessIf"); +REGISTER_OPTYPE_DEFINE(IF, "If"); +REGISTER_OPTYPE_DEFINE(CASE, "Case"); +REGISTER_OPTYPE_DEFINE(_WHILE, "_While"); +REGISTER_OPTYPE_DEFINE(WHILE, "While"); +REGISTER_OPTYPE_DEFINE(STATELESSWHILE, "StatelessWhile"); +REGISTER_OPTYPE_DEFINE(FOR, "For"); +REGISTER_OPTYPE_DEFINE(PARTITIONEDCALL, "PartitionedCall"); +REGISTER_OPTYPE_DEFINE(STATEFULPARTITIONEDCALL, "StatefulPartitionedCall"); +REGISTER_OPTYPE_DEFINE(FAKEPARAM, "FakeParam"); REGISTER_OPTYPE_DEFINE(TRANSPOSE, "Transpose"); REGISTER_OPTYPE_DEFINE(TRANSPOSED, "TransposeD"); REGISTER_OPTYPE_DEFINE(CAST, "Cast"); @@ -275,7 +288,7 @@ REGISTER_OPTYPE_DEFINE(BASICLSTMCELL, "BasicLSTMCell"); REGISTER_OPTYPE_DEFINE(GETNEXT, "GetNext"); REGISTER_OPTYPE_DEFINE(INITDATA, "InitData"); -// Ann special operator +/***************Ann special operator*************************/ REGISTER_OPTYPE_DEFINE(ANN_MEAN, "AnnMean"); REGISTER_OPTYPE_DEFINE(ANN_CONVOLUTION, "AnnConvolution"); REGISTER_OPTYPE_DEFINE(ANN_DEPCONVOLUTION, "AnnDepthConv"); @@ -292,7 +305,8 @@ REGISTER_OPTYPE_DEFINE(ANN_QUANTIZE, "AnnQuant"); REGISTER_OPTYPE_DEFINE(ANN_PAD, "AnnPad"); REGISTER_OPTYPE_DEFINE(ANN_RESIZE_BILINEAR, "AnnResizeBilinear"); -// Training operator +/***************************************************/ +/******************Training operator*************************/ REGISTER_OPTYPE_DEFINE(GATHERV2, "GatherV2"); REGISTER_OPTYPE_DEFINE(CONVGRADFILTER, "Conv2DBackpropFilter"); REGISTER_OPTYPE_DEFINE(CONV2D, "Conv2D"); @@ -376,6 +390,11 @@ REGISTER_OPTYPE_DEFINE(ENDGRAPH, "EndGraph"); REGISTER_OPTYPE_DEFINE(SEND, "Send"); REGISTER_OPTYPE_DEFINE(RECV, "Recv"); +REGISTER_OPTYPE_DEFINE(LABELSET, "LabelSet"); +REGISTER_OPTYPE_DEFINE(LABELGOTO, "LabelGoto"); +REGISTER_OPTYPE_DEFINE(LABELSWITCH, "LabelSwitch"); +REGISTER_OPTYPE_DEFINE(LABELSWITCHBYINDEX, "LabelSwitchByIndex"); + REGISTER_OPTYPE_DEFINE(ATOMICADDRCLEAN, "AtomicAddrClean"); REGISTER_OPTYPE_DEFINE(ABS_GRAD, "AbsGrad"); @@ -448,174 +467,310 @@ const uint64_t ALLOC_MEMORY_MAX_SIZE = 8589934592; // Max size of 8 GB. const uint64_t ALLOC_MEMORY_MAX_SIZE = 536870912; // Max size of 512M. #endif -// Magic number of model file +/// +///@brief Magic number of model file +/// const uint32_t MODEL_FILE_MAGIC_NUM = 0x444F4D49; // magic number -// Model head length +/// +///@brief Model head length +/// const uint32_t MODEL_FILE_HEAD_LEN = 256; -// Input node type +/// +///@ingroup domi_omg +///@brief Input node type +/// const std::string INPUT_TYPE = "Input"; -// AIPP label, label AIPP conv operator +/// +///@ingroup domi_omg +///@brief AIPP label, label AIPP conv operator +/// const std::string AIPP_CONV_FLAG = "Aipp_Conv_Flag"; -// AIPP label, label aipp data operator +/// +///@ingroup domi_omg +///@brief AIPP label, label aipp data operator +/// const std::string AIPP_DATA_FLAG = "Aipp_Data_Flag"; -// Record the w dimension of model input corresponding to dynamic AIPP +/// +///@ingroup domi_omg +///@brief Record the w dimension of model input corresponding to dynamic AIPP +/// const std::string AIPP_RELATED_DATA_DIM_W = "aipp_related_data_dim_w"; -// Record the H dimension of model input corresponding to dynamic AIPP +/// +///@ingroup domi_omg +///@brief Record the H dimension of model input corresponding to dynamic AIPP +/// const std::string AIPP_RELATED_DATA_DIM_H = "aipp_related_data_dim_h"; -// The tag of the data operator. Mark this input to the dynamic AIPP operator +/// +///@ingroup domi_omg +///@brief The tag of the data operator. 
Mark this input to the dynamic AIPP operator +/// const std::string INPUT_TO_DYNAMIC_AIPP = "input_to_dynamic_aipp"; -// DATA node type +/// +///@ingroup domi_omg +///@brief DATA node type +/// const std::string DATA_TYPE = "Data"; -// DATA node type +/// +///@ingroup domi_omg +///@brief DATA node type +/// const std::string AIPP_DATA_TYPE = "AippData"; -// Frame operator type +/// +///@ingroup domi_omg +///@brief Frame operator type +/// const std::string FRAMEWORK_OP_TYPE = "FrameworkOp"; -// Data node type +/// +///@ingroup domi_omg +///@brief Data node type +/// const std::string ANN_DATA_TYPE = "AnnData"; const std::string ANN_NETOUTPUT_TYPE = "AnnNetOutput"; const std::string ANN_DEPTHCONV_TYPE = "AnnDepthConv"; const std::string ANN_CONV_TYPE = "AnnConvolution"; const std::string ANN_FC_TYPE = "AnnFullConnection"; -// Convolution node type +/// +///@ingroup domi_omg +///@brief Convolution node type +/// const std::string NODE_NAME_NET_OUTPUT = "Node_Output"; const std::string NODE_NAME_END_GRAPH = "Node_EndGraph"; -// Convolution node type +/// +///@ingroup domi_omg +///@brief Convolution node type +/// const std::string OP_TYPE_CONVOLUTION = "Convolution"; -// Add convolution node name to AIPP +/// +///@ingroup domi_omg +///@brief Add convolution node name to AIPP +/// const std::string AIPP_CONV_OP_NAME = "aipp_conv_op"; -// Operator configuration item separator +/// +///@ingroup domi_omg +///@brief Operator configuration item separator +/// const std::string OP_CONF_DELIMITER = ":"; -// attr value name +/// +///@ingroup domi_omg +///@brief attr value name +/// const std::string ATTR_NAME_VALUE1 = "value1"; -// attr value name, 6d_2_4d C +/// +///@ingroup domi_omg +///@brief attr value name, 6d_2_4d C +/// const std::string ATTR_NAME_INPUT_CVALUE = "input_cvalue"; -// alpha default value +/// +///@ingroup domi_omg +///@brief alpha default value +/// const float ALPHA_DEFAULT_VALUE = 1.0; -// beta default value +/// +///@ingroup domi_omg +///@brief beta default value +/// const float BETA_DEFAULT_VALUE = 0.0; -// coef default value +/// +///@ingroup domi_omg +///@brief coef default value +/// const float COEF_DEFAULT_VALUE = 0.0; -// Relu6 coef value +/// +///@ingroup domi_omg +///@brief Relu6 coef value +/// const float RELU6_COEF = 6.0; -// stride default value +/// +///@ingroup domi_omg +///@brief stride default value +/// const uint32_t STRIDE_DEFAULT_VALUE = 1; -// pad default value +/// +///@ingroup domi_omg +///@brief pad default value +/// const uint32_t PAD_DEFAULT_VALUE = 0; -// dilation default value +/// +///@ingroup domi_omg +///@brief dilation default value +/// const int DILATION_DEFAULT_VALUE = 1; -// kernel default value +/// +///@ingroup domi_omg +///@brief kernel default value +/// const uint32_t KERNEL_DEFAULT_VALUE = 0; -// defaule convolution group size +/// +///@ingroup domi_omg +///@brief defaule convolution group size +/// const uint32_t DEFAULT_CONV_GROUP = 1; -// Default deconvolution adj +/// +///@ingroup domi_omg +///@brief Default deconvolution adj +/// const uint32_t DEFAULT_DECONV_ADJ = 0; -// Represents value 1 +/// +///@ingroup domi_omg +///@brief Represents value 1 +/// const uint32_t NUM_ONE = 1; -// spatial dim size default value +/// +///@ingroup domi_omg +///@brief spatial dim size default value +/// const int32_t SPATIAL_DIM_DEFAULT_SIZE = 2; -// dim extended default value +/// +///@ingroup domi_omg +///@brief dim extended default value +/// const int32_t DIM_DEFAULT_VALUE = 1; -// The first weight list in opdef is filter +/// +///@ingroup 
domi_omg +///@brief The first weight list in opdef is filter +/// const int32_t WEIGHT_FILTER_INDEX = 0; -// The second weight list in opdef is bias +/// +///@ingroup domi_omg +///@brief The second weight list in opdef is bias +/// const int32_t WEIGHT_BIAS_INDEX = 1; const int32_t TENSOR_ND_SUPPORT_SIZE = 8; -// NCHW index default value +/// +///@ingroup domi_omg +///@brief NCHW index default value +/// const uint32_t NCHW_DIM_N = 0; const uint32_t NCHW_DIM_C = 1; const uint32_t NCHW_DIM_H = 2; const uint32_t NCHW_DIM_W = 3; -// KCHW index default value +/// +///@ingroup domi_omg +///@brief KCHW index default value +/// const uint32_t KCHW_DIM_K = 0; const uint32_t KCHW_DIM_C = 1; const uint32_t KCHW_DIM_H = 2; const uint32_t KCHW_DIM_W = 3; -// HWCK index default value +/// +///@ingroup domi_omg +///@brief HWCK index default value +/// const uint32_t HWCK_DIM_H = 0; const uint32_t HWCK_DIM_W = 1; const uint32_t HWCK_DIM_C = 2; const uint32_t HWCK_DIM_K = 3; -// NHWC index default value +/// +///@ingroup domi_omg +///@brief NHWC index default value +/// const uint32_t NHWC_DIM_N = 0; const uint32_t NHWC_DIM_H = 1; const uint32_t NHWC_DIM_W = 2; const uint32_t NHWC_DIM_C = 3; -// CHWN index default value +/// +///@ingroup domi_omg +///@brief CHWN index default value +/// const uint32_t CHWN_DIM_N = 3; const uint32_t CHWN_DIM_C = 0; const uint32_t CHWN_DIM_H = 1; const uint32_t CHWN_DIM_W = 2; -// CHW index default value +/// +///@ingroup domi_omg +///@brief CHW index default value +/// const uint32_t CHW_DIM_C = 0; const uint32_t CHW_DIM_H = 1; const uint32_t CHW_DIM_W = 2; -// HWC index default value +/// +///@ingroup domi_omg +///@brief HWC index default value +/// const uint32_t HWC_DIM_H = 0; const uint32_t HWC_DIM_W = 1; const uint32_t HWC_DIM_C = 2; -// Pad index default value +/// +///@ingroup domi_omg +///@brief Pad index default value +/// const uint32_t PAD_H_HEAD = 0; const uint32_t PAD_H_TAIL = 1; const uint32_t PAD_W_HEAD = 2; const uint32_t PAD_W_TAIL = 3; -// window index default value +/// +///@ingroup domi_omg +///@brief window index default value +/// const uint32_t WINDOW_H = 0; const uint32_t WINDOW_W = 1; -// stride index default value +/// +///@ingroup domi_omg +///@brief stride index default value +/// const uint32_t STRIDE_H = 0; const uint32_t STRIDE_W = 1; -// dilation index default value +/// +///@ingroup domi_omg +///@brief dilation index default value +/// const uint32_t DILATION_H = 0; const uint32_t DILATION_W = 1; -// the num of XRBG channel +/// +///@ingroup domi_omg +///@brief the num of XRBG channel +/// const uint32_t XRGB_CHN_NUM = 4; -// global pooling default value +/// +///@ingroup domi_omg +///@brief global pooling default value +/// const bool DEFAULT_GLOBAL_POOLING = false; -const uint32_t MODEL_VERSION = 0x10000000; /**< Model version 1.0 */ +const uint32_t MODEL_VERSION = 0x10000000; ///< Model version 1.0/// // Eltwise's input size const int ELTWISE_MIN_INPUT_SIZE = 2; diff --git a/src/ge/common/util.cc b/src/ge/common/util.cc index b5a730bc..79ead57b 100644 --- a/src/ge/common/util.cc +++ b/src/ge/common/util.cc @@ -49,7 +49,7 @@ namespace { * If such an exception is encountered during operation, * the proto file can be divided into several small files or the limit value can be increased. */ -const int kProtoReadBytesLimit = INT_MAX; // Max size of 2 GB minus 1 byte. +const int kProtoReadBytesLimit = INT_MAX; // Max size of 2 GB minus 1 byte. 
const int kWarningThreshold = 536870912 * 2; // 536870912 represent 512M /// The maximum length of the file. @@ -96,7 +96,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromBinaryFile(co } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromArray(const void *data, int size, Message *proto) { - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((proto == nullptr|| data == nullptr || size == 0), return false, + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((proto == nullptr || data == nullptr || size == 0), return false, "incorrect parameter. proto is nullptr || data is nullptr || size is 0"); google::protobuf::io::CodedInputStream coded_stream(reinterpret_cast(const_cast(data)), size); @@ -176,10 +176,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadBytesFromBinaryFile(co GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size > kMaxFileSizeLimit, file.close(); return false, "file size %ld is out of limit: %d.", size, kMaxFileSizeLimit); - file.seekg(0, std::ios::beg); + file.seekg(0, std::ios::beg); // [no need to check value] - buffer.resize(static_cast(size)); - file.read(&buffer[0], size); + buffer.resize(static_cast(size)); // [no need to check value] + file.read(&buffer[0], size); // [no need to check value] file.close(); GELOGI("Read size:%ld", size); return true; @@ -261,7 +261,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromText(const ch google::protobuf::io::IstreamInputStream input(&fs); bool ret = google::protobuf::TextFormat::Parse(&input, message); GE_IF_BOOL_EXEC( - !ret, GELOGE(ret, "Call [google::protobuf::TextFormat::Parse] func ret fail, please check your text file.")); + !ret, GELOGE(ret, "Call [google::protobuf::TextFormat::Parse] func ret fail, please check your text file.")); fs.close(); return ret; @@ -277,7 +277,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromMem(const cha google::protobuf::io::IstreamInputStream input(&fs); bool ret = google::protobuf::TextFormat::Parse(&input, message); GE_IF_BOOL_EXEC( - !ret, GELOGE(ret, "Call [google::protobuf::TextFormat::Parse] func ret fail, please check your text file.")); + !ret, GELOGE(ret, "Call [google::protobuf::TextFormat::Parse] func ret fail, please check your text file.")); return ret; } @@ -344,28 +344,27 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInputPathValid(const } // A regular matching expression to verify the validity of the input file path - // ^(/|./|(../)+|)([.]?[A-Za-z0-9_-]+/)*[A-Za-z0-9_+.-]+$ - // Path section:Support upper and lower case letters, numbers and underscores - // File name section:Support upper and lower case letters, numbers, underscores and dots(.) - std::string mode = "^(/+|./+|(../+)+|)(../|([.]?[A-Za-z0-9_-]+)/+)*[A-Za-z0-9_+.-]+$"; - - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(!ValidateStr(file_path, mode), return false, - "input path [%s] is with illegal character. path can only be composed of upper and " - "lower case letters, numbers, minus sign(-) and underscores; filename can only be " - "composed of upper and lower case letters, numbers, underscores, dot(.), plus " - "sign(+) and minus sign(-).", - file_path.c_str()); + // ^(/|./|(../)+|)([.]?[\u4e00-\u9fa5A-Za-z0-9_.-]+/)*[\u4e00-\u9fa5A-Za-z0-9_+.-]+$ + // Path section:Support upper and lower case letters, numbers dots(.) chinese and underscores + // File name section:Support upper and lower case letters, numbers, underscores chinese and dots(.) 
+ std::string mode = "^(/+|./+|(../+)+|)(../|([.]?[\u4e00-\u9fa5A-Za-z0-9_.-]+)/+)*[\u4e00-\u9fa5A-Za-z0-9_+.-]+$"; + + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + !ValidateStr(file_path, mode), return false, + "input [%s] is illegal. path can only contains 'a-z' 'A-Z' '0-9' '-' '.' '_' and chinese; filename can " + "only contains 'a-z' 'A-Z' '0-9' '_' '.' '+' '-' and chinese", + file_path.c_str()); std::string real_path = RealPath(file_path.c_str()); // Unable to get absolute path (does not exist or does not have permission to access) if (real_path.empty()) { - GELOGE(ge::FAILED, "Can not get real path for %s.", file_path.c_str()); + GELOGE(ge::FAILED, "Can not get real path for %s, %s", file_path.c_str(), strerror(errno)); return false; } // The absolute path points to a file that is not readable if (access(real_path.c_str(), R_OK) != 0) { - GELOGE(ge::FAILED, "Can not read file in %s.", file_path.c_str()); + GELOGE(ge::FAILED, "Can not read file in %s, %s", file_path.c_str(), strerror(errno)); return false; } @@ -380,24 +379,23 @@ FMK_FUNC_HOST_VISIBILITY bool CheckOutputPathValid(const std::string &file_path) } // A regular matching expression to verify the validity of the input file path - // ^(/|./|(../)+|)([.]?[A-Za-z0-9_-]+/)*[A-Za-z0-9_+.-]+$ - // Path section:Support upper and lower case letters, numbers and underscores - // File name section:Support upper and lower case letters, numbers, underscores and dots(.) - std::string mode = "^(/+|./+|(../+)+|)(../|([.]?[A-Za-z0-9_-]+)/+)*[A-Za-z0-9_+.-]+$"; - - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(!ValidateStr(file_path, mode), return false, - "input path [%s] is with illegal character. path can only be composed of upper and " - "lower case letters, numbers, minus sign(-) and underscores; filename can only be " - "composed of upper and lower case letters, numbers, underscores, dot(.), plus " - "sign(+) and minus sign(-).", - file_path.c_str()); + // ^(/|./|(../)+|)([.]?[\u4e00-\u9fa5A-Za-z0-9_-]+/)*[\u4e00-\u9fa5A-Za-z0-9_+.-]+$ + // Path section:Support upper and lower case letters, numbers dots(.) chinese and underscores + // File name section:Support upper and lower case letters, numbers, underscores chinese and dots(.) + std::string mode = "^(/+|./+|(../+)+|)(../|([.]?[\u4e00-\u9fa5A-Za-z0-9_.-]+)/+)*[\u4e00-\u9fa5A-Za-z0-9_+.-]+$"; + + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( + !ValidateStr(file_path, mode), return false, + "output [%s] is illegal. path can only contains 'a-z' 'A-Z' '0-9' '-' '.' '_' and chinese; filename can " + "only contains 'a-z' 'A-Z' '0-9' '_' '.' 
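
The widened patterns above additionally admit CJK characters through the \u4e00-\u9fa5 range; whether that range works as intended depends on ValidateStr's regex engine and the source charset. A byte-oriented std::regex sketch of just the ASCII part of the rule:

#include <regex>
#include <string>

// ASCII subset of the path rule from the patch; the real pattern also allows
// the CJK range \u4e00-\u9fa5, which needs a UTF-8-aware regex engine.
bool PathLooksValid(const std::string &path) {
  static const std::regex mode(
      "^(/+|\\./+|(\\.\\./+)+|)(\\.\\./|([.]?[A-Za-z0-9_.-]+)/+)*[A-Za-z0-9_+.-]+$");
  return std::regex_match(path, mode);
}

int main() {
  return (PathLooksValid("./model/resnet50.om") && !PathLooksValid("bad|name")) ? 0 : 1;
}
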
'+' '-' and chinese", + file_path.c_str()); std::string real_path = RealPath(file_path.c_str()); // Can get absolute path (file exists) if (!real_path.empty()) { // File is not readable or writable if (access(real_path.c_str(), R_OK | W_OK | F_OK) != 0) { - GELOGE(ge::FAILED, "Path[ %s ] exists, but can not be write.", file_path.c_str()); + GELOGE(ge::FAILED, "Path[ %s ] exists, but can not be write, %s", file_path.c_str(), strerror(errno)); return false; } } else { diff --git a/src/ge/engine_manager/dnnengine_manager.cc b/src/ge/engine_manager/dnnengine_manager.cc index 7c08e4d3..1eb38489 100644 --- a/src/ge/engine_manager/dnnengine_manager.cc +++ b/src/ge/engine_manager/dnnengine_manager.cc @@ -37,6 +37,7 @@ const char *const kIndependent = "independent"; const char *const kSkipAssignStream = "skip_assign_stream"; const char *const kCalEngines = "cal_engines"; const char *const kAttch = "attach"; +const char *const kVectorCore = "VectorCore"; const char *const kVectorEngine = "VectorEngine"; const char *const kAIcoreEngine = "AIcoreEngine"; const char *const kCustomOpFlag = "_custom_op_flag"; @@ -151,7 +152,7 @@ std::shared_ptr DNNEngineManager::GetEngine(const std::string &na return nullptr; } -bool DNNEngineManager::IsEngineRegistered(const std::string &name) const { +bool DNNEngineManager::IsEngineRegistered(const std::string &name) { auto iter = engines_map_.find(name); if (iter != engines_map_.end()) { return true; @@ -160,11 +161,9 @@ bool DNNEngineManager::IsEngineRegistered(const std::string &name) const { return false; } -std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) const { - if (op_desc == nullptr) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, "DNNEngineManager: op_desc is nullptr"); - return ""; - } +std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) { + GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGE(GE_CLI_GE_NOT_INITIALIZED, "DNNEngineManager: op_desc is nullptr"); + return ""); // Use the OpsKernelManager in GELib to get the opInfos for this opCode std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if ((instance_ptr == nullptr) || (!instance_ptr->InitFlag())) { @@ -182,7 +181,7 @@ std::string DNNEngineManager::GetDNNEngineName(const OpDescPtr &op_desc) const { if (ret != SUCCESS) { GELOGD("get the option CORE_TYPE fail, set it to default value VECTOR_ENGINE"); } - string exclude_core_Type = (ge_core_type == kVectorEngine) ? kAIcoreEngine : kVectorEngine; + string exclude_core_Type = (ge_core_type == kVectorCore) ? 
kAIcoreEngine : kVectorEngine; GELOGD("engine type will exclude: %s", exclude_core_Type.c_str()); std::map unsupported_reasons; for (const auto &it : op_infos) { @@ -371,7 +370,7 @@ Status DNNEngineManager::ReadJsonFile(const std::string &file_path, JsonHandle h const char *file = file_path.data(); if ((access(file, F_OK)) == -1) { if (engines_map_.size() != 0) { - GELOGE(FAILED, "The json file %s is not exist", file_path.c_str()); + GELOGE(FAILED, "The json file %s is not exist, %s", file_path.c_str(), strerror(errno)); return FAILED; } else { GELOGW("The json file %s is not need", file_path.c_str()); diff --git a/src/ge/engine_manager/dnnengine_manager.h b/src/ge/engine_manager/dnnengine_manager.h index f4b1b551..ab813398 100644 --- a/src/ge/engine_manager/dnnengine_manager.h +++ b/src/ge/engine_manager/dnnengine_manager.h @@ -59,9 +59,9 @@ class DNNEngineManager { public: friend class GELib; std::shared_ptr GetEngine(const std::string &name) const; - bool IsEngineRegistered(const std::string &name) const; + bool IsEngineRegistered(const std::string &name); // If can't find appropriate engine name, return "", report error - string GetDNNEngineName(const OpDescPtr &op_desc) const; + string GetDNNEngineName(const OpDescPtr &op_desc); const map &GetSchedulers() const; private: diff --git a/src/ge/executor/CMakeLists.txt b/src/ge/executor/CMakeLists.txt index 265ae5ee..7401b062 100755 --- a/src/ge/executor/CMakeLists.txt +++ b/src/ge/executor/CMakeLists.txt @@ -30,6 +30,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "../common/profiling/profiling_manager.cc" "../graph/execute/graph_execute.cc" "../graph/load/graph_loader.cc" + "../graph/load/new_model_manager/cpu_queue_schedule.cc" "../graph/load/new_model_manager/data_dumper.cc" "../graph/load/new_model_manager/data_inputer.cc" "../graph/load/new_model_manager/davinci_model.cc" @@ -50,6 +51,9 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "../graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "../graph/load/new_model_manager/task_info/stream_active_task_info.cc" "../graph/load/new_model_manager/task_info/stream_switch_task_info.cc" + "../graph/load/new_model_manager/task_info/stream_switchn_task_info.cc" + "../graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc" + "../graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc" "../graph/load/new_model_manager/task_info/task_info.cc" "../graph/load/new_model_manager/tbe_handle_store.cc" "../graph/load/output/output.cc" diff --git a/src/ge/executor/ge_executor.cc b/src/ge/executor/ge_executor.cc index 555cef07..7342f1a7 100644 --- a/src/ge/executor/ge_executor.cc +++ b/src/ge/executor/ge_executor.cc @@ -15,17 +15,17 @@ */ #include "executor/ge_executor.h" - #include #include #include #include - #include "common/debug/log.h" -#include "framework/common/debug/ge_log.h" #include "common/ge/ge_util.h" #include "common/helper/model_helper.h" +#include "common/profiling/profiling_manager.h" #include "common/util.h" +#include "framework/common/debug/ge_log.h" +#include "framework/common/util.h" #include "graph/execute/graph_execute.h" #include "graph/load/graph_loader.h" #include "graph/load/new_model_manager/davinci_model_parser.h" @@ -35,31 +35,15 @@ #include "graph/utils/graph_utils.h" #include "mmpa/mmpa_api.h" #include "single_op/single_op_manager.h" -#include "framework/common/util.h" -#include "common/profiling/profiling_manager.h" namespace { -const uint64_t kDynamicImageSizeParamNum = 2; -} // namespace - -namespace ge 
{ -bool GeExecutor::is_init_ = false; - -class ModelListenerAdapter : public ModelListener { - public: - domi::Status OnComputeDone(uint32_t model_id, uint32_t data_index, uint32_t result_code) { - if (listener == nullptr) { - GELOGE(ge::FAILED, "listener is null."); - return FAILED; - } - return listener->OnComputeDone(model_id, data_index, result_code); - } - - std::shared_ptr listener; -}; +const size_t kDynamicBatchSizeVecSize = 1; +const size_t kDynamicImageSizeVecSize = 2; +const size_t kDynamicImageSizeInputSize = 2; +const char *const kBatchLabel = "Batch_"; -ge::Status TransferDomiErrorCode(const uint32_t error_code) { - switch (error_code) { +ge::Status TransferDomiErrorCode(const uint32_t errorCode) { + switch (errorCode) { case ge::PARAM_INVALID: case domi::PARAM_INVALID: return ge::PARAM_INVALID; @@ -72,19 +56,19 @@ ge::Status TransferDomiErrorCode(const uint32_t error_code) { } void GetGeTensorDescFromDomiInfo(std::vector &ge_descs, - const std::vector &domi_descs, + const std::vector &domi_descs, const std::vector &formats) { uint32_t idx = 0; for (auto desc_item : domi_descs) { ge::TensorDesc ge_desc; ge_desc.SetName(desc_item.name); - ge_desc.SetDataType(static_cast(desc_item.data_type)); + ge_desc.SetDataType(static_cast(desc_item.data_type)); ge_desc.SetFormat(static_cast(formats[idx])); std::vector shape_dims; for (auto dim : desc_item.shape_info.dims) { shape_dims.push_back(dim); } - Shape ge_shape(shape_dims); + ge::Shape ge_shape(shape_dims); ge_desc.SetShape(ge_shape); ge_desc.SetSize(desc_item.size); ge_descs.emplace_back(ge_desc); @@ -92,32 +76,111 @@ void GetGeTensorDescFromDomiInfo(std::vector &ge_descs, } } -void GetDomiInputData(const ge::RunModelData &input_data, InputData &inputs) { +void GetDomiInputData(const ge::RunModelData &input_data, ge::InputData &inputs) { inputs.index = input_data.index; - inputs.model_id = input_data.model_id; + inputs.model_id = input_data.modelId; inputs.timestamp = input_data.timestamp; inputs.timeout = input_data.timeout; inputs.request_id = input_data.request_id; for (const auto &data_item : input_data.blobs) { - DataBuffer data_buf{data_item.data, data_item.length, data_item.isDataSupportMemShare}; - inputs.blobs.emplace_back(data_buf); + ge::DataBuffer dataBuf{data_item.data, data_item.length, data_item.isDataSupportMemShare}; + inputs.blobs.emplace_back(dataBuf); } } -void GetDomiOutputData(const ge::RunModelData &output_data, OutputData &outputs) { +void GetDomiOutputData(const ge::RunModelData &output_data, ge::OutputData &outputs) { outputs.index = output_data.index; - outputs.model_id = output_data.model_id; + outputs.model_id = output_data.modelId; for (const auto &data_item : output_data.blobs) { - DataBuffer data_buf(data_item.data, data_item.length, data_item.isDataSupportMemShare); - outputs.blobs.emplace_back(data_buf); + ge::DataBuffer dataBuf(data_item.data, data_item.length, data_item.isDataSupportMemShare); + outputs.blobs.emplace_back(dataBuf); + } +} + +void SetDynamicInputDataFlag(const ge::RunModelData &input_data, const std::vector> batch_info, + ge::InputData &inputs) { + inputs.is_dynamic_batch = true; + std::string batch_label; + for (size_t i = 0; i < batch_info.size(); ++i) { + if (batch_info[i].size() == kDynamicBatchSizeVecSize && + batch_info[i][0] == static_cast(input_data.dynamic_batch_size)) { + batch_label = kBatchLabel + std::to_string(i); + inputs.batch_label = batch_label; + break; + } else if (batch_info[i].size() == kDynamicImageSizeVecSize && + batch_info[i][0] == 
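
SetDynamicInputDataFlag above tags the request with the label Batch_<i> of whichever gear matched the runtime value. The selection for the one-element batch gear, in a compact form; kBatchLabel mirrors the constant introduced by the patch:

#include <cstdint>
#include <string>
#include <vector>

const char *const kBatchLabel = "Batch_";

// Returns "Batch_<i>" for the first gear matching the requested batch size,
// or "" when no gear matches.
std::string MatchBatchLabel(uint64_t batch_size, const std::vector<std::vector<int64_t>> &batch_info) {
  for (size_t i = 0; i < batch_info.size(); ++i) {
    if (batch_info[i].size() == 1 && batch_info[i][0] == static_cast<int64_t>(batch_size)) {
      return kBatchLabel + std::to_string(i);
    }
  }
  return "";
}

int main() {
  std::vector<std::vector<int64_t>> gears = {{1}, {4}, {8}};
  return MatchBatchLabel(4, gears) == "Batch_1" ? 0 : 1;
}
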
static_cast(input_data.dynamic_image_height) && + batch_info[i][1] == static_cast(input_data.dynamic_image_width)) { + batch_label = kBatchLabel + std::to_string(i); + inputs.batch_label = batch_label; + break; + } + } + GELOGI("current batch label:%s", batch_label.c_str()); +} + +bool IsDynamicBatchSizeMatchModel(uint64_t batch_size, const vector> &batch_info) { + if (batch_info.empty()) { + GELOGE(ge::FAILED, "Dynamic batch info is empty."); + return false; + } + + for (auto batch : batch_info) { + if (batch.size() != kDynamicBatchSizeVecSize) { + GELOGE(ge::FAILED, "Dynamic batch param num is %zu, current batch size is %zu.", kDynamicBatchSizeVecSize, + batch.size()); + return false; + } + if (batch[0] == static_cast(batch_size)) { + return true; + } + } + GELOGE(ge::FAILED, "Dynamic batch %lu can not match the gear of model.", batch_size); + return false; +} + +bool IsDynamicImageSizeMatchModel(uint64_t image_height, uint64_t image_width, + const vector> &batch_info) { + if (batch_info.empty()) { + GELOGE(ge::FAILED, "Dynamic batch info is empty."); + return false; } + + for (auto resolution : batch_info) { + if (resolution.size() != kDynamicImageSizeVecSize) { + GELOGE(ge::FAILED, "Dynamic resolution param num is %zu, current resolution size is %zu.", + kDynamicImageSizeVecSize, resolution.size()); + return false; + } + if (resolution[0] == static_cast(image_height) && resolution[1] == static_cast(image_width)) { + return true; + } + } + + GELOGE(ge::FAILED, "Dynamic resolution (%lu,%lu) can not match the gear of model.", image_height, image_width); + return false; } +} // namespace + +namespace ge { +bool GeExecutor::isInit_ = false; +class ModelListenerAdapter : public ModelListener { + public: + domi::Status OnComputeDone(uint32_t model_id, uint32_t dataIndex, uint32_t resultCode) { + if (listener == nullptr) { + GELOGE(ge::FAILED, "listener is null."); + return FAILED; + } + return listener->OnComputeDone(model_id, dataIndex, resultCode); + } + + std::shared_ptr listener; +}; GeExecutor::GeExecutor() {} Status GeExecutor::Initialize() { GELOGI("Init ge_executor begin."); - if (is_init_) { + if (isInit_) { GELOGW("Already inited, don't need to init again."); return ge::SUCCESS; } @@ -130,38 +193,160 @@ Status GeExecutor::Initialize() { } // Start profiling - int32_t device_id = 0; - rtError_t rt_ret = rtGetDevice(&device_id); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "runtime get device_id failed, current device_id:%d", device_id); - return FAILED; - } - GELOGI("current device_id:%d", device_id); Options profiling_options; - profiling_options.device_id = device_id; + profiling_options.device_id = 0; profiling_options.job_id = ""; ProfilingManager::Instance().Init(profiling_options); - if (ProfilingManager::Instance().Init(profiling_options) != SUCCESS) { - GELOGE(FAILED, "Failed to init profiling."); - return FAILED; - } - is_init_ = true; + isInit_ = true; GELOGI("Init ge_executor over."); return ge::SUCCESS; } +Status GeExecutor::Finalize() { + GELOGI("Uninit ge_executor begin."); + if (isInit_ == false) { + GELOGW("ge_executor needs to init begin."); + return ge::SUCCESS; + } + + // Stop profiling + ProfilingManager::Instance().StopProfiling(); + GELOGI("Uninit ge_executor over."); + return ge::SUCCESS; +} + +Status GeExecutor::SetDynamicBatchSize(uint32_t model_id, void *dynamic_input_addr, uint64_t length, + uint64_t batch_size) { + if (dynamic_input_addr == nullptr) { + GELOGE(FAILED, "Dynamic input addr is nullptr!"); + return FAILED; + } + + uint64_t size = 
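
The two matcher helpers above differ only in gear arity: batch gears carry one element, resolution gears carry a (height, width) pair. A generic sketch of the same containment test:

#include <cstdint>
#include <vector>

// True when some gear in batch_info equals `wanted` element-for-element.
// Batch gears have one element; resolution gears have two (height, width).
bool GearMatches(const std::vector<int64_t> &wanted, const std::vector<std::vector<int64_t>> &batch_info) {
  for (const auto &gear : batch_info) {
    if (gear == wanted) {
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<std::vector<int64_t>> gears = {{224, 224}, {448, 448}};
  return (GearMatches({448, 448}, gears) && !GearMatches({512, 512}, gears)) ? 0 : 1;
}
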
sizeof(uint64_t); + if (length < size) { + GELOGE(FAILED, "Dynamic input size [%lu] is less than [%lu]!", length, size); + return FAILED; + } + + // Verify whether the input dynamic batch matches the model gear + std::vector> batch_info; + Status ret = GraphExecutor::GetDynamicBatchInfo(model_id, batch_info); + if (ret != SUCCESS) { + GELOGE(FAILED, "Get dynamic input info failed."); + return FAILED; + } + + if (!IsDynamicBatchSizeMatchModel(batch_size, batch_info)) { + GELOGE(FAILED, "The current dynamic input does not match the gear of the model."); + return FAILED; + } + + // memcpy dynamic_batch_size from host to device + if (rtMemcpy(dynamic_input_addr, length, &batch_size, size, RT_MEMCPY_HOST_TO_DEVICE) != RT_ERROR_NONE) { + GELOGE(FAILED, "memcpy dynamic batch input data failed!"); + return FAILED; + } + return SUCCESS; +} + +Status GeExecutor::SetDynamicImageSize(uint32_t model_id, void *dynamic_input_addr, uint64_t length, + uint64_t image_height, uint64_t image_width) { + if (dynamic_input_addr == nullptr) { + GELOGE(FAILED, "Dynamic input addr is nullptr!"); + return FAILED; + } + + uint64_t dynamic_input_size = kDynamicImageSizeInputSize * sizeof(uint64_t); + if (length < dynamic_input_size) { + GELOGE(FAILED, "Dynamic input size [%lu] is less than [%lu]!", length, dynamic_input_size); + return FAILED; + } + + // Verify whether the input dynamic resolution matches the model gear + std::vector> batch_info; + Status ret = GraphExecutor::GetDynamicBatchInfo(model_id, batch_info); + if (ret != SUCCESS) { + GELOGE(FAILED, "Get dynamic input info failed."); + return FAILED; + } + + if (!IsDynamicImageSizeMatchModel(image_height, image_width, batch_info)) { + GELOGE(FAILED, "The current dynamic input does not match the gear of the model."); + return FAILED; + } + + // Memcpy dynamic resolution height from host to device + if (rtMemcpy(dynamic_input_addr, sizeof(uint64_t), &image_height, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE) != + RT_ERROR_NONE) { + GELOGE(FAILED, "memcpy dynamic resolution input data failed!"); + return FAILED; + } + + uint64_t remain_size = length - sizeof(uint64_t); + // Memcpy dynamic resolution width from host to device + if (rtMemcpy(reinterpret_cast(reinterpret_cast(dynamic_input_addr) + sizeof(uint64_t)), + remain_size, &image_width, sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE) != RT_ERROR_NONE) { + GELOGE(FAILED, "memcpy dynamic resolution input data failed!"); + return FAILED; + } + return SUCCESS; +} + +Status GeExecutor::SetDynamicAippData(uint32_t model_id, void *dynamic_input_addr, uint64_t length, + const std::vector &aippBatchPara, + const kAippDynamicPara &aippParms) { + GELOGI("Enter to SetDynamicAippData."); + if (dynamic_input_addr == nullptr) { + GELOGE(FAILED, "Dynamic aipp input addr is nullptr!"); + return FAILED; + } + if (aippBatchPara.empty()) { + GELOGE(FAILED, "aippBatchPara is empty."); + return FAILED; + } + uint64_t batch_num = aippBatchPara.size(); + uint64_t real_aippParms_size = sizeof(kAippDynamicPara) - sizeof(kAippDynamicBatchPara); + uint64_t struct_len = batch_num * sizeof(kAippDynamicBatchPara) + real_aippParms_size; + GELOGI( + "Get acl input dynamic aipp data, model_id is %u, length is %lu," + "batch num is %lu, struct_len is %lu", + model_id, length, batch_num, struct_len); + if (struct_len > length) { + GELOGE(FAILED, "input dynamic aipp param len [%lu] is larger than aipp_data size [%lu]", struct_len, length); + return FAILED; + } + // Memcpy real kAippDynamicBatchPara from host to device + if 
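
SetDynamicImageSize above writes height and then width as two consecutive uint64_t slots at the dynamic-input address, one rtMemcpy each. A host-only sketch of that two-slot layout, with plain memcpy standing in for the host-to-device copy:

#include <cstdint>
#include <cstring>

// Fills a caller-provided buffer the way SetDynamicImageSize lays out device
// memory: slot 0 = height, slot 1 = width. Returns false on a short buffer.
bool PackImageSize(void *dst, uint64_t length, uint64_t height, uint64_t width) {
  if (dst == nullptr || length < 2 * sizeof(uint64_t)) {
    return false;
  }
  std::memcpy(dst, &height, sizeof(uint64_t));
  std::memcpy(static_cast<uint8_t *>(dst) + sizeof(uint64_t), &width, sizeof(uint64_t));
  return true;
}

int main() {
  uint64_t buf[2] = {0, 0};
  return (PackImageSize(buf, sizeof(buf), 224, 224) && buf[0] == 224 && buf[1] == 224) ? 0 : 1;
}
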
(rtMemcpy(dynamic_input_addr, length, &aippParms, real_aippParms_size, RT_MEMCPY_HOST_TO_DEVICE) != + RT_ERROR_NONE) { + GELOGE(FAILED, "memcpy real_aippParms_size failed!"); + return FAILED; + } + uint64_t remain_len = length - real_aippParms_size; + uint8_t *aipp_batch_para_dev = reinterpret_cast(dynamic_input_addr) + real_aippParms_size; + + for (uint64_t i = 0; i < batch_num; ++i) { + if (rtMemcpy(reinterpret_cast(aipp_batch_para_dev + i * sizeof(kAippDynamicBatchPara)), + (remain_len - i * sizeof(kAippDynamicBatchPara)), &(aippBatchPara[i]), sizeof(kAippDynamicBatchPara), + RT_MEMCPY_HOST_TO_DEVICE) != RT_ERROR_NONE) { + GELOGE(FAILED, "memcpy kAippDynamicBatchPara input data failed!"); + return FAILED; + } + } + return SUCCESS; +} + // Load model Status GeExecutor::LoadModelOffline(uint32_t &model_id, const std::string &path, const std::string &key, int32_t priority, std::shared_ptr listener) { GELOGI("load model offline begin."); - if (!is_init_) { + if (!isInit_) { GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); return GE_EXEC_NOT_INIT; } - string file_path = RealPath(path.c_str()); - if (file_path.empty()) { + string filePath = RealPath(path.c_str()); + if (filePath.empty()) { GELOGE(ge::FAILED, "fileath is invalid. please check your text file '%s'.", path.c_str()); return ge::FAILED; } @@ -183,13 +368,12 @@ Status GeExecutor::LoadModelOffline(uint32_t &model_id, const std::string &path, Status GeExecutor::LoadModel(uint32_t &model_id, const ModelData &model_data, std::shared_ptr listener) { - GELOGI("Load model begin, model_id:%u.", model_id); - if (!is_init_) { + GELOGI("Load model begin."); + if (!isInit_) { GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); return GE_EXEC_NOT_INIT; } - Status ret; std::shared_ptr listener_adapter = MakeShared(); if (listener_adapter == nullptr) { GELOGE(MEMALLOC_FAILED, "ModelListenerAdapter make shared failed!"); @@ -197,7 +381,7 @@ Status GeExecutor::LoadModel(uint32_t &model_id, const ModelData &model_data, } listener_adapter->listener = listener; - ret = GraphLoader::LoadModel(model_data, listener_adapter, model_id); + Status ret = GraphLoader::LoadModel(model_data, listener_adapter, model_id); if (ret != SUCCESS) { GELOGE(ret, "[GeExecutor] LoadModel failed."); return TransferDomiErrorCode(ret); @@ -207,21 +391,17 @@ Status GeExecutor::LoadModel(uint32_t &model_id, const ModelData &model_data, Status GeExecutor::UnloadModel(uint32_t model_id) { GELOGI("unload model %u begin.", model_id); - if (!is_init_) { + if (!isInit_) { GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); return GE_EXEC_NOT_INIT; } - // Stop profiling - if (!ProfilingManager::Instance().ProfilingOpTraceOn() && ProfilingManager::Instance().ProfilingOn()) { - ProfilingManager::Instance().StopProfiling(); - } return GraphLoader::UnloadModel(model_id); } Status GeExecutor::RunModel(const ge::RunModelData &input_data, ge::RunModelData &output_data) { GELOGI("run model begin."); - if (!is_init_) { + if (!isInit_) { GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); return GE_EXEC_NOT_INIT; } @@ -238,7 +418,7 @@ Status GeExecutor::RunModel(const ge::RunModelData &input_data, ge::RunModelData Status GeExecutor::GetModelDescInfo(uint32_t model_id, std::vector &input_desc, std::vector &output_desc) { GELOGI("get model desc info begin."); - if (!is_init_) { + if (!isInit_) { GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); return GE_EXEC_NOT_INIT; } @@ -274,10 +454,34 @@ Status GeExecutor::GetModelDescInfo(uint32_t model_id, std::vector> &batch_info) { + GELOGI("Begin to get dynamic batch info."); + if 
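
SetDynamicAippData above copies the fixed-size parameter header first, then each per-batch record at its offset behind the header. The offset arithmetic on its own; BatchPara and AippHeader are stand-ins, the real kAippDynamicBatchPara and kAippDynamicPara are SDK types with more fields:

#include <cstdint>
#include <cstring>
#include <vector>

struct BatchPara { uint64_t crop_w, crop_h; };            // stand-in for kAippDynamicBatchPara
struct AippHeader { uint64_t input_format, batch_num; };  // stand-in header fields

// Packs the header followed by batch records, as the patch does on device memory.
bool PackAipp(void *dst, uint64_t length, const AippHeader &hdr, const std::vector<BatchPara> &batches) {
  uint64_t need = sizeof(AippHeader) + batches.size() * sizeof(BatchPara);
  if (dst == nullptr || need > length) {
    return false;  // mirrors the struct_len > length bail-out
  }
  std::memcpy(dst, &hdr, sizeof(AippHeader));
  uint8_t *cursor = static_cast<uint8_t *>(dst) + sizeof(AippHeader);
  for (size_t i = 0; i < batches.size(); ++i) {
    std::memcpy(cursor + i * sizeof(BatchPara), &batches[i], sizeof(BatchPara));
  }
  return true;
}

int main() {
  uint8_t buf[128];
  AippHeader hdr{1, 2};
  std::vector<BatchPara> batches = {{224, 224}, {448, 448}};
  return PackAipp(buf, sizeof(buf), hdr, batches) ? 0 : 1;
}
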
(!isInit_) { + GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); + return GE_EXEC_NOT_INIT; + } + + Status ret = GraphExecutor::GetDynamicBatchInfo(model_id, batch_info); + if (ret != SUCCESS) { + GELOGE(ret, "GetDynamicBatchInfo failed."); + return ret; + } + + GELOGI("Get dynamic batch info succ."); + return SUCCESS; +} + Status GeExecutor::GetModelDescInfoForZeroCopy(uint32_t model_id, std::vector &input_desc, std::vector &output_desc) { GELOGI("get model desc info for zero copy begin."); - if (!is_init_) { + if (!isInit_) { GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); return GE_EXEC_NOT_INIT; } @@ -314,6 +518,7 @@ Status GeExecutor::GetModelDescInfoForZeroCopy(uint32_t model_id, std::vector(max_mem_size); return ret; } -/// -/// @ingroup ge -/// @brief Load data from model file to memory -/// @param [in] const std::string &path: Offline model file path -/// @param [out] domi::ModelData &model_data: Offline model memory data -/// @return SUCCESS handle successfully / others handle failed -/// +/** + * @ingroup ge + * @brief Load data from model file to memory + * @param [in] const std::string &path: Offline model file path + * @param [out] domi::ModelData &model_data: Offline model memory data + * @return SUCCESS handle successfully / others handle failed + */ Status GeExecutor::LoadDataFromFile(const std::string &path, ModelData &model_data) { - string file_path = RealPath(path.c_str()); - if (file_path.empty()) { - GELOGE(ge::FAILED, "file_path is invalid. please check your text file '%s'.", path.c_str()); + GELOGI("Load data from file begin."); + if (!isInit_) { + GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); + return GE_EXEC_NOT_INIT; + } + + string filePath = RealPath(path.c_str()); + if (filePath.empty()) { + GELOGE(ge::FAILED, "filePath is invalid. 
please check your text file '%s'.", path.c_str()); return ge::FAILED; } - GELOGI("load model_data from file: %s.", path.c_str()); + GELOGI("load modelData from file: %s.", path.c_str()); std::string key_path; int32_t priority = 0; Status ret = GraphLoader::LoadDataFromFile(path, key_path, priority, model_data); @@ -356,71 +573,102 @@ Status GeExecutor::LoadDataFromFile(const std::string &path, ModelData &model_da return ret; } -/// -/// @ingroup ge -/// @brief Load model from offline model memory data -/// @param [in] domi::ModelData &model_data: Offline model data -/// void *dev_ptr: Input/Output memory start address -/// size_t memsize: Input/Output memory length -/// void *weight_ptr: Weight memory start address -/// size_t weightsize: Weight memory length -/// @param [out] uint32_t &model_id: identification after model loading -/// @return SUCCESS handle successfully / others handle failed -/// +/** +* @ingroup ge +* @brief Load model from offline model memory data +* @param [in] domi::ModelData &model_data: Offline model data + void *dev_ptr: Input/Output memory start address + size_t memsize: Input/Output memory length + void *weight_ptr: Weight memory start address + size_t weightsize: Weight memory length +* @param [out] uint32_t &model_id: identification after model loading +* @return SUCCESS handle successfully / others handle failed +*/ Status GeExecutor::LoadModelFromData(uint32_t &model_id, const ModelData &model_data, void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) { + GELOGI("Load model from data begin."); + if (!isInit_) { + GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); + return GE_EXEC_NOT_INIT; + } + return GraphLoader::LoadModelFromData(model_id, model_data, dev_ptr, mem_size, weight_ptr, weight_size); } -/// -/// @ingroup ge -/// @brief Load task list from ModelData with queue. -/// @param [out] model_id: model id allocate from manager. -/// @param [in] ge_model_data: Model data load from offline model. -/// @param [in] input_queue_ids: input queue ids create from user. -/// @param [in] output_queue_ids: input queue ids create from user. -/// @return: 0 for success / others for fail -/// +/** + * @ingroup ge + * @brief Load task list from ModelData with queue. + * @param [out] model_id: model id allocate from manager. + * @param [in] ge_model_data: Model data load from offline model. + * @param [in] input_queue_ids: input queue ids create from user. + * @param [in] output_queue_ids: input queue ids create from user. + * @return: 0 for success / others for fail + */ Status GeExecutor::LoadModelWithQ(uint32_t &model_id, const ModelData &model_data, const std::vector &input_queue_ids, const std::vector &output_queue_ids) { + GELOGI("Load model with queue begin."); + if (!isInit_) { + GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); + return GE_EXEC_NOT_INIT; + } return GraphLoader::LoadModelWithQ(model_id, model_data, input_queue_ids, output_queue_ids); } -/// -/// @ingroup ge -/// @brief Synchronous execution of offline model(Do not create thread) -/// @param [in] uint32_t modelId: Model ID to execute -/// void* stream: stream to execute -/// const domi::InputData *input_data: Model input data -/// bool async_mode: is asynchronize mode. 
-/// @param [out] domi::OutputData *output_data: Model output data -/// @return SUCCESS handle successfully / others handle failed -/// -Status GeExecutor::ExecModel(uint32_t model_id, void *stream, const ge::RunModelData &input_data, - ge::RunModelData &output_data, bool async_mode) { - if (!is_init_) { +/** +* @ingroup ge +* @brief Synchronous execution of offline model(Do not create thread) +* @param [in] uint32_t model_id: Model ID to execute + void* stream: stream to execute + const domi::InputData *input_data: Model input data + bool async_mode: is asynchronize mode. +* @param [out] domi::OutputData *output_data: Model output data +* @return SUCCESS handle successfully / others handle failed +*/ +Status GeExecutor::ExecModel(uint32_t model_id, void *stream, const ge::RunModelData &run_input_data, + ge::RunModelData &run_output_data, bool async_mode) { + GELOGI("Execute model begin."); + if (!isInit_) { GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); return GE_EXEC_NOT_INIT; } - InputData input_data_tmp; - OutputData output_data_tmp; - GetDomiInputData(input_data, input_data_tmp); - GetDomiOutputData(output_data, output_data_tmp); + InputData input_data; + OutputData output_data; + GetDomiInputData(run_input_data, input_data); + GetDomiOutputData(run_output_data, output_data); + + if ((run_input_data.dynamic_batch_size != 0) || (run_input_data.dynamic_image_width != 0) || + (run_input_data.dynamic_image_height != 0)) { + std::vector> batch_info; + Status ret = GraphExecutor::GetDynamicBatchInfo(model_id, batch_info); + if (ret != SUCCESS) { + GELOGE(FAILED, "Get dynamic input info failed."); + return FAILED; + } + if (!batch_info.empty()) { + SetDynamicInputDataFlag(run_input_data, batch_info, input_data); + } + } - return GraphLoader::ExecuteModel(model_id, stream, async_mode, input_data_tmp, output_data_tmp); + return GraphLoader::ExecuteModel(model_id, stream, async_mode, input_data, output_data); } -/// -/// @ingroup ge -/// @brief Get weight memory size from model file -/// @param [in] const std::string &path: Offline model file path -/// @param [out] size_t &mem_size Execution memory size -/// size_t &weight_size Weight memory space size -/// @return SUCCESS handle successfully / others handle failed -/// +/** +* @ingroup ge +* @brief Get weight memory size from model file +* @param [in] const std::string &path: Offline model file path +* @param [out] size_t &mem_size Execution memory size + size_t &weight_size Weight memory space size +* @return SUCCESS handle successfully / others handle failed +*/ Status GeExecutor::GetMemAndWeightSize(const std::string &path, size_t &mem_size, size_t &weight_size) { + GELOGI("Get memory and weight size from file begin."); + if (!isInit_) { + GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); + return GE_EXEC_NOT_INIT; + } + ModelData model; std::string key; Status ret = ge::GraphLoader::LoadDataFromFile(path, key, 0, model); @@ -437,17 +685,23 @@ Status GeExecutor::GetMemAndWeightSize(const std::string &path, size_t &mem_size return ret; } -/// -/// @ingroup ge -/// @brief Get weight memory size from model file -/// @param [in] const void *model_data Offline model buffer -/// size_t model_size Offline model buffer length -/// @param [out] size_t &mem_size Execution memory size -/// size_t &weight_size Weight memory space size -/// @return SUCCESS handle successfully / others handle failed -/// +/** +* @ingroup ge +* @brief Get weight memory size from model file +* @param [in] const void *model_data Offline model buffer + size_t model_size Offline 
model buffer length +* @param [out] size_t &mem_size Execution memory size + size_t &weight_size Weight memory space size +* @return SUCCESS handle successfully / others handle failed +*/ Status GeExecutor::GetMemAndWeightSize(const void *model_data, size_t model_size, size_t &mem_size, size_t &weight_size) { + GELOGI("Get memory and weight size from data begin."); + if (!isInit_) { + GELOGE(GE_EXEC_NOT_INIT, "not inited yet!"); + return GE_EXEC_NOT_INIT; + } + if (model_data == nullptr) { GELOGE(PARAM_INVALID, "invalid model data!"); return PARAM_INVALID; @@ -460,9 +714,9 @@ Status GeExecutor::GetMemAndWeightSize(const void *model_data, size_t model_size return ge::ModelManager::GetModelMemAndWeightSize(model, mem_size, weight_size); } -Status GeExecutor::LoadSingleOp(const std::string &model_name, const ge::ModelData &model_data, void *stream, +Status GeExecutor::LoadSingleOp(const std::string &model_name, const ge::ModelData &modelData, void *stream, SingleOp **single_op) { - return SingleOpManager::GetInstance().GetOpFromModel(model_name, model_data, stream, single_op); + return SingleOpManager::GetInstance().GetOpFromModel(model_name, modelData, stream, single_op); } Status GeExecutor::ExecuteAsync(SingleOp *executor, const std::vector &inputs, diff --git a/src/ge/ge_local_engine/CMakeLists.txt b/src/ge/ge_local_engine/CMakeLists.txt index 1db26782..559c782d 100755 --- a/src/ge/ge_local_engine/CMakeLists.txt +++ b/src/ge/ge_local_engine/CMakeLists.txt @@ -20,7 +20,7 @@ file(GLOB_RECURSE PROTO_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} ) file(GLOB_RECURSE SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} - "engine/*.cc" + "engine/ge_local_engine.cc" "ops_kernel_store/*.cc" ) diff --git a/src/ge/ge_local_engine/engine/host_cpu_engine.cc b/src/ge/ge_local_engine/engine/host_cpu_engine.cc new file mode 100644 index 00000000..9ee616ac --- /dev/null +++ b/src/ge/ge_local_engine/engine/host_cpu_engine.cc @@ -0,0 +1,249 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "host_cpu_engine.h" +#include +#include "graph/common/omg_util.h" +#include "graph/utils/op_desc_utils.h" +#include "graph/utils/tensor_adapter.h" +#include "mmpa/mmpa_api.h" +#include "register/op_kernel_registry.h" +#include "common/ge/ge_util.h" +#include "common/ge/plugin_manager.h" + +namespace ge { +namespace { +const char *kEnvKeyOppPath = "ASCEND_OPP_PATH"; +const char *kHostCpuLibRelativePath = "/op_impl/built-in/host_cpu"; +} // namespace + +void HostCpuEngine::CloseSo() { + for (auto handle : lib_handles_) { + if (dlclose(handle) != 0) { + GELOGW("failed to close handle, message: %s", dlerror()); + } + } + lib_handles_.clear(); +} + +ge::Status HostCpuEngine::Initialize() { + std::lock_guard lock(mu_); + if (initialized_) { + GELOGI("HostCpuEngine is already initialized"); + return SUCCESS; + } + std::string lib_dir; + GE_CHK_STATUS_RET_NOLOG(GetLibPath(lib_dir)); + + std::vector so_paths; + if (ListSoFiles(lib_dir, so_paths) == SUCCESS) { + (void)LoadLibs(so_paths); + } + + initialized_ = true; + return SUCCESS; +} + +void HostCpuEngine::Finalize() { GELOGI("start HostCpuEngine::Finalize"); } + +bool HostCpuEngine::CheckSupported(const string &op_type) { + return OpKernelRegistry::GetInstance().IsRegistered(op_type); +} + +Status HostCpuEngine::FindOpKernel(const ge::NodePtr &node, std::unique_ptr &op_kernel) { + std::string op_type; + auto status = GetOriginalType(node, op_type); + GE_CHK_BOOL_EXEC_NOLOG(status == SUCCESS, return status); + + auto kernel = OpKernelRegistry::GetInstance().CreateHostCpuOp(op_type); + if (kernel == nullptr) { + GELOGD("Op of type %s is not supported by host cpu engine", op_type.c_str()); + return UNSUPPORTED; + } + + GELOGD("Successfully created op kernel. op type = %s", op_type.c_str()); + op_kernel = std::move(kernel); + return SUCCESS; +} + +Status HostCpuEngine::PrepareInputs(const ge::ConstOpDescPtr &op_desc, const vector &inputs, + map &named_inputs) { + auto num_inputs = op_desc->GetInputsSize(); + if (num_inputs != inputs.size()) { + GELOGE(PARAM_INVALID, "Mismatching input sizes. op_desc has %zu input(s), but given %zu", num_inputs, + inputs.size()); + return PARAM_INVALID; + } + + for (size_t i = 0; i < num_inputs; ++i) { + auto ge_tensor = inputs[i]; + GE_CHECK_NOTNULL(ge_tensor); + auto tensor = TensorAdapter::AsTensor(*ge_tensor); + auto tensor_name = op_desc->GetInputNameByIndex(i); + GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get input name. node = %s, index = %zu", + op_desc->GetName().c_str(), i); + GELOGD("Successfully inserted input tensor. node = %s, index = %zu, input name = %s", op_desc->GetName().c_str(), i, + tensor_name.c_str()); + named_inputs.emplace(tensor_name, tensor); + } + + return SUCCESS; +} + +Status HostCpuEngine::PrepareOutputs(const ge::ConstOpDescPtr &op_desc, vector &outputs, + map &named_outputs) { + for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { + auto ge_tensor = MakeShared(op_desc->GetOutputDesc(i)); + GE_CHECK_NOTNULL(ge_tensor); + outputs.emplace_back(ge_tensor); + auto tensor = TensorAdapter::AsTensor(*ge_tensor); + auto tensor_name = op_desc->GetOutputNameByIndex(i); + GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu", + op_desc->GetName().c_str(), i); + GELOGD("Successfully inserted output tensor. 
node = %s, index = %zu, output name = %s", op_desc->GetName().c_str(), + i, tensor_name.c_str()); + named_outputs.emplace(tensor_name, tensor); + } + + return SUCCESS; +} + +Status HostCpuEngine::RunInternal(const ge::OpDescPtr &op_desc, HostCpuOp &op_kernel, + map &named_inputs, + map &named_outputs) { + GELOGD("To run host cpu op: %s", op_desc->GetName().c_str()); + Operator op = ge::OpDescUtils::CreateOperatorFromOpDesc(op_desc); + auto ret = op_kernel.Compute(op, named_inputs, named_outputs); + if (ret != GRAPH_SUCCESS) { + GELOGE(FAILED, "Failed to compute host cpu op. node = %s, ret = %u", op_desc->GetName().c_str(), ret); + return FAILED; + } + + return SUCCESS; +} + +Status HostCpuEngine::Run(NodePtr &node, const vector &inputs, std::vector &outputs) { + GE_CHECK_NOTNULL(node); + GE_CHECK_NOTNULL(node->GetOpDesc()); + + GELOGD("To run node by host cpu engine. node name = %s", node->GetName().c_str()); + std::unique_ptr op_kernel; + GE_CHK_STATUS_RET_NOLOG(FindOpKernel(node, op_kernel)); + + std::map named_inputs; + std::vector tmp_outputs; + std::map named_outputs; + auto op_desc = node->GetOpDesc(); + GE_CHK_STATUS_RET_NOLOG(PrepareInputs(op_desc, inputs, named_inputs)); + GE_CHK_STATUS_RET_NOLOG(PrepareOutputs(op_desc, tmp_outputs, named_outputs)); + GE_CHK_STATUS_RET_NOLOG(RunInternal(op_desc, *op_kernel, named_inputs, named_outputs)); + + GELOGD("Ran node by host cpu engine successfully. name node = %s", node->GetName().c_str()); + outputs.swap(tmp_outputs); + return SUCCESS; +} + +ge::Status HostCpuEngine::GetLibPath(std::string &lib_path) { + GELOGI("Start to get host cpu lib path"); + const char *path_env = std::getenv(kEnvKeyOppPath); + if (path_env != nullptr) { + lib_path = path_env; + if (!lib_path.empty()) { + lib_path += kHostCpuLibRelativePath; + GELOGI("Get host cpu so path from env: %s", lib_path.c_str()); + return SUCCESS; + } + } + + lib_path = PluginManager::GetPath(); + GELOGI("path_base is %s", lib_path.c_str()); + lib_path = lib_path.substr(0, lib_path.rfind('/')); + lib_path = lib_path.substr(0, lib_path.rfind('/')); + lib_path += "/opp"; + lib_path += kHostCpuLibRelativePath; + + GELOGI("Get host cpu so path from PluginManager::GetPath: %s", lib_path.c_str()); + return SUCCESS; +} + +static int RegularFileFilterFn(const mmDirent *entry) { return entry->d_type == DT_REG; } + +Status HostCpuEngine::ListSoFiles(const std::string &base_dir, std::vector &names) { + std::string real_path = base_dir; + GE_CHK_STATUS_RET_NOLOG(GetRealPath(real_path)); + real_path.push_back('/'); + mmDirent **entries = nullptr; + auto ret = mmScandir(real_path.c_str(), &entries, RegularFileFilterFn, nullptr); + if (ret < 0) { + GELOGW("scan dir failed. 
path = %s, ret = %d", real_path.c_str(), ret); + return INTERNAL_ERROR; + } + + for (int i = 0; i < ret; ++i) { + mmDirent *dir_ent = entries[i]; + string name = string(dir_ent->d_name); + if (IsSoFile(name)) { + names.emplace_back(real_path + name); + } + } + + mmScandirFree(entries, ret); + GELOGI("Found %d libs to load", ret); + return SUCCESS; +} + +bool HostCpuEngine::IsSoFile(const std::string &file_name) { + static const std::string so_suffix(".so"); + auto pos = file_name.rfind(so_suffix); + if (pos == string::npos) { + return false; + } + + return pos == file_name.size() - so_suffix.size(); +} + +Status HostCpuEngine::LoadLibs(std::vector &lib_paths) { + for (auto &so_path : lib_paths) { + GE_CHK_STATUS_RET_NOLOG(GetRealPath(so_path)); + GE_CHK_STATUS_RET_NOLOG(LoadLib(so_path)); + } + + return SUCCESS; +} + +Status HostCpuEngine::LoadLib(const std::string &lib_path) { + GELOGI("To invoke dlopen on lib: %s", lib_path.c_str()); + auto handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (handle == nullptr) { + GELOGE(INTERNAL_ERROR, "Failed to invoke dlopen. path = %s, error = %s", lib_path.c_str(), dlerror()); + return INTERNAL_ERROR; + } + + lib_handles_.emplace_back(handle); + return SUCCESS; +} + +Status HostCpuEngine::GetRealPath(std::string &path) { + std::string real_path = RealPath(path.c_str()); + if (real_path.empty()) { + GELOGW("File path %s is invalid.", path.c_str()); + return INTERNAL_ERROR; + } + + path = real_path; + return SUCCESS; +} +} // namespace ge \ No newline at end of file diff --git a/src/ge/ge_local_engine/engine/host_cpu_engine.h b/src/ge/ge_local_engine/engine/host_cpu_engine.h new file mode 100644 index 00000000..88985f87 --- /dev/null +++ b/src/ge/ge_local_engine/engine/host_cpu_engine.h @@ -0,0 +1,78 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_GE_LOCAL_ENGINE_ENGINE_HOST_CPU_ENGINE_H_ +#define GE_GE_LOCAL_ENGINE_ENGINE_HOST_CPU_ENGINE_H_ + +#include +#include "framework/common/ge_inner_error_codes.h" +#include "graph/node.h" +#include "graph/operator.h" +#include "register/register.h" + +namespace ge { +class HostCpuEngine { + public: + ~HostCpuEngine() = default; + + static HostCpuEngine &GetInstance() { + static HostCpuEngine instance; + return instance; + } + + ge::Status Initialize(); + + void Finalize(); + + static bool CheckSupported(const string &op_type); + + ge::Status Run(NodePtr &node, const vector &inputs, std::vector &outputs); + + private: + HostCpuEngine() = default; + + void CloseSo(); + + ge::Status LoadLibs(std::vector &lib_paths); + + ge::Status LoadLib(const std::string &lib_path); + + static ge::Status GetRealPath(std::string &path); + + static ge::Status GetLibPath(std::string &lib_path); + + static ge::Status ListSoFiles(const std::string &base_dir, std::vector &names); + + static bool IsSoFile(const std::string &file_name); + + static ge::Status FindOpKernel(const NodePtr &node, std::unique_ptr &op_kernel); + + static ge::Status PrepareInputs(const ConstOpDescPtr &op_desc, const vector &inputs, + std::map &named_inputs); + + static ge::Status PrepareOutputs(const ConstOpDescPtr &op_desc, vector &outputs, + std::map &named_outputs); + + static ge::Status RunInternal(const OpDescPtr &op_desc, HostCpuOp &op_kernel, + std::map &named_inputs, + std::map &named_outputs); + + std::mutex mu_; + std::vector lib_handles_; + bool initialized_ = false; +}; +} // namespace ge +#endif // GE_GE_LOCAL_ENGINE_ENGINE_HOST_CPU_ENGINE_H_ diff --git a/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc index 18819125..cde6640f 100644 --- a/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc +++ b/src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc @@ -17,17 +17,18 @@ #include "ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.h" #include #include "common/constant/constant.h" -#include "framework/common/debug/ge_log.h" -#include "common/ge_inner_error_codes.h" #include "common/ge/ge_util.h" +#include "common/ge_inner_error_codes.h" +#include "framework/common/debug/ge_log.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" #include "op/op_factory.h" #include "proto/task.pb.h" namespace { -const char *kConstantOpType = "Constant"; -const char *kConstantOpAttrName = "value"; +const char *const kConstantOpType = "Constant"; +const char *const kConstantOpAttrName = "value"; +const char *const kDataOpType = "Data"; } // namespace namespace ge { namespace ge_local { @@ -77,11 +78,11 @@ Status GeLocalOpsKernelInfoStore::CalcOpRunningParam(Node &ge_node) { Format format = output_tensor.GetFormat(); DataType data_type = output_tensor.GetDataType(); - uint32_t mem_size = 0; + int64_t mem_size = 0; graphStatus graph_status = TensorUtils::GetSize(output_tensor, mem_size); // If mem size has been set, no need reset. 
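    // A positive size already recorded on the tensor desc means an earlier pass calculated
    // this output, so it is kept as-is; DT_STRING falls through because its memory size
    // depends on the string value rather than on the shape.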
if ((graph_status == GRAPH_SUCCESS) && (mem_size > 0) && (data_type != DT_STRING)) { - GELOGD("Op[%s:%s] out[%zu] mem size has been set, no need calc again, format=%s, data_type=%s, mem_size=%u.", + GELOGD("Op[%s:%s] out[%zu] mem size has been set, no need calc again, format=%s, data_type=%s, mem_size=%ld.", node_name.c_str(), node_type.c_str(), i, TypeUtils::FormatToSerialString(format).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str(), mem_size); continue; @@ -91,6 +92,10 @@ Status GeLocalOpsKernelInfoStore::CalcOpRunningParam(Node &ge_node) { GeShape output_shape = output_tensor.GetShape(); if ((node_type == kConstantOpType) && (data_type == DT_STRING)) { graph_status = CalcConstantStrMemSize(op_desc, output_mem_size); + } else if (node_type == kDataOpType) { + int64_t output_size = 0; + graph_status = TensorUtils::GetTensorMemorySizeInBytes(output_tensor, output_size); + output_mem_size = output_size; } else { graph_status = TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size); } @@ -116,14 +121,7 @@ Status GeLocalOpsKernelInfoStore::CalcOpRunningParam(Node &ge_node) { node_name.c_str(), node_type.c_str(), i, output_mem_size, TypeUtils::FormatToSerialString(format).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); - if (output_mem_size > static_cast(UINT_MAX)) { - GELOGE(FAILED, - "Calc op[%s:%s] out[%zu] mem size failed, as GE need data, " - "type is uint32, but output_mem_size[%ld] is overflow.", - node_name.c_str(), node_type.c_str(), i, output_mem_size); - return FAILED; - } - TensorUtils::SetSize(output_tensor, static_cast(output_mem_size)); + TensorUtils::SetSize(output_tensor, output_mem_size); graph_status = op_desc->UpdateOutputDesc(static_cast(i), output_tensor); if (graph_status != GRAPH_SUCCESS) { @@ -174,7 +172,7 @@ Status GeLocalOpsKernelInfoStore::GenerateTask(const Node &node, RunContext &con GELOGE(ret, "Node:%s(%s) op run failed.", name.c_str(), type.c_str()); return ret; } - GELOGD("Ge local generate task for node:%s(%s) end, tasks.size()=%zu.", name.c_str(), type.c_str(), tasks.size()); + GELOGI("Ge local generate task for node:%s(%s) end, tasks.size()=%zu.", name.c_str(), type.c_str(), tasks.size()); return ret; } diff --git a/src/ge/ge_local_engine/ops_kernel_store/op/ge_deleted_op.cc b/src/ge/ge_local_engine/ops_kernel_store/op/ge_deleted_op.cc index 8c68abbd..6a327bb8 100644 --- a/src/ge/ge_local_engine/ops_kernel_store/op/ge_deleted_op.cc +++ b/src/ge/ge_local_engine/ops_kernel_store/op/ge_deleted_op.cc @@ -58,6 +58,7 @@ REGISTER_OP_CREATOR(Placeholder, GeDeletedOp); REGISTER_OP_CREATOR(End, GeDeletedOp); REGISTER_OP_CREATOR(Merge, GeDeletedOp); REGISTER_OP_CREATOR(Switch, GeDeletedOp); +REGISTER_OP_CREATOR(SwitchN, GeDeletedOp); REGISTER_OP_CREATOR(RefMerge, GeDeletedOp); REGISTER_OP_CREATOR(RefSwitch, GeDeletedOp); } // namespace ge_local diff --git a/src/ge/ge_local_engine/ops_kernel_store/op/no_op.cc b/src/ge/ge_local_engine/ops_kernel_store/op/no_op.cc index 5bbec472..58777e53 100644 --- a/src/ge/ge_local_engine/ops_kernel_store/op/no_op.cc +++ b/src/ge/ge_local_engine/ops_kernel_store/op/no_op.cc @@ -24,6 +24,7 @@ namespace ge_local { NoOp::NoOp(const Node &node, RunContext &run_context) : Op(node, run_context) {} Status NoOp::Run() { + GELOGI("Node:%s type is %s, no need gen task.", name_.c_str(), type_.c_str()); // Do nothing return SUCCESS; } @@ -43,5 +44,17 @@ REGISTER_OP_CREATOR(Const, NoOp); REGISTER_OP_CREATOR(NetOutput, NoOp); REGISTER_OP_CREATOR(ControlTrigger, NoOp); + +// Functional Op. 
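+// Control-flow ops are registered as NoOp here because they only organize execution:
+// their subgraph bodies are compiled and launched separately (see the label allocator
+// introduced by this patch), so the framework node itself generates no device task.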
+REGISTER_OP_CREATOR(If, NoOp); +REGISTER_OP_CREATOR(_If, NoOp); +REGISTER_OP_CREATOR(StatelessIf, NoOp); +REGISTER_OP_CREATOR(Case, NoOp); +REGISTER_OP_CREATOR(While, NoOp); +REGISTER_OP_CREATOR(_While, NoOp); +REGISTER_OP_CREATOR(StatelessWhile, NoOp); +REGISTER_OP_CREATOR(For, NoOp); +REGISTER_OP_CREATOR(PartitionedCall, NoOp); +REGISTER_OP_CREATOR(StatefulPartitionedCall, NoOp); } // namespace ge_local } // namespace ge diff --git a/src/ge/ge_runtime/runtime_model.cc b/src/ge/ge_runtime/runtime_model.cc index 5573fa89..ffb0d8a0 100644 --- a/src/ge/ge_runtime/runtime_model.cc +++ b/src/ge/ge_runtime/runtime_model.cc @@ -208,6 +208,10 @@ bool RuntimeModel::LoadTask() { } task_id_list_.push_back(task_id); } + if (task_list_.empty()) { + GELOGE(FAILED, "Task list is empty"); + return false; + } GELOGI("Distribute task succ."); auto rt_ret = rtModelLoadComplete(rt_model_handle_); diff --git a/src/ge/ge_runtime/task/aicpu_task.cc b/src/ge/ge_runtime/task/aicpu_task.cc index 07f287e1..4cb71866 100644 --- a/src/ge/ge_runtime/task/aicpu_task.cc +++ b/src/ge/ge_runtime/task/aicpu_task.cc @@ -58,6 +58,7 @@ bool AicpuTask::Distribute() { GELOGE(RT_FAILED, "Call rt api(rtMalloc) failed, ret: 0x%X.", rt_ret); return false; } + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "task args data.", args_size) // Memcpy AicpuParamHead rt_ret = rtMemcpy(args_, sizeof(aicpu::AicpuParamHead), reinterpret_cast(&aicpu_param_head), sizeof(aicpu::AicpuParamHead), RT_MEMCPY_HOST_TO_DEVICE); diff --git a/src/ge/ge_runtime/task/cce_task.cc b/src/ge/ge_runtime/task/cce_task.cc index e2fef432..e5ea99c0 100644 --- a/src/ge/ge_runtime/task/cce_task.cc +++ b/src/ge/ge_runtime/task/cce_task.cc @@ -90,6 +90,7 @@ bool CceTask::Distribute() { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return false; } + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "task information.", task_info_->flow_table().size()) rt_ret = rtMemcpy(flowtable_, task_info_->flow_table().size(), task_info_->flow_table().data(), task_info_->flow_table().size(), RT_MEMCPY_HOST_TO_DEVICE); @@ -117,6 +118,7 @@ bool CceTask::Distribute() { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return false; } + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "task information.", task_info_->args_size()) rt_ret = rtMemcpy(args_, task_info_->args_size(), task_info_->args().data(), task_info_->args_size(), RT_MEMCPY_HOST_TO_DEVICE); diff --git a/src/ge/ge_runtime/task/tbe_task.cc b/src/ge/ge_runtime/task/tbe_task.cc index 88279f1e..19056c1b 100644 --- a/src/ge/ge_runtime/task/tbe_task.cc +++ b/src/ge/ge_runtime/task/tbe_task.cc @@ -85,6 +85,7 @@ bool TbeTask::Distribute() { GELOGE(RT_FAILED, "rtMalloc failed, ret: %d", static_cast(rt_ret)); return false; } + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "task args data.", args_size) rt_ret = rtMemcpy(args_, args_size, reinterpret_cast(tensor_device_addrs.data()), args_size, RT_MEMCPY_HOST_TO_DEVICE); diff --git a/src/ge/generator/ge_generator.cc b/src/ge/generator/ge_generator.cc index 728bc424..8cae441c 100644 --- a/src/ge/generator/ge_generator.cc +++ b/src/ge/generator/ge_generator.cc @@ -28,6 +28,7 @@ #include "graph/utils/graph_utils.h" #include "model/ge_model.h" +using ge::ModelBufferData; using std::map; using std::string; using std::vector; @@ -121,13 +122,14 @@ class GeGenerator::Impl { Status BuildModel(const Graph &graph, const vector &inputs, GraphId &graph_id, vector &ge_models); - Status SaveModel(const string &file_name_prefix, vector models); + Status SaveModel(const string &file_name_prefix, vector &models, ModelBufferData 
&model); Status SaveParams(GeModelPtr &ge_model, const string &type, const map &attrs, const vector &inputs, const vector &outputs); GraphManager graph_manager_; SaveParam save_param_; + bool is_offline_ = true; }; Status GeGenerator::Initialize(const map &options) { @@ -185,6 +187,16 @@ Status GeGenerator::Finalize() { Status GeGenerator::GenerateOfflineModel(const Graph &graph, const string &file_name_prefix, const vector &inputs) { GELOGI("Start to GenerateOfflineModel."); + ModelBufferData model; + return GenerateModel(graph, file_name_prefix, inputs, model, true); +} + +Status GeGenerator::GenerateOnlineModel(const Graph &graph, const vector &inputs, ModelBufferData &model) { + return GenerateModel(graph, "online", inputs, model, false); +} + +Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_prefix, const vector &inputs, + ModelBufferData &model, bool is_offline) { GraphId graph_id; vector ge_models; GE_CHECK_NOTNULL_EXEC(impl_, return PARAM_INVALID); @@ -196,7 +208,7 @@ Status GeGenerator::GenerateOfflineModel(const Graph &graph, const string &file_ } else { model_name = compute_graph->GetName(); } - + impl_->is_offline_ = is_offline; Status ret = impl_->BuildModel(graph, inputs, graph_id, ge_models); if (ret != SUCCESS) { GELOGE(ret, "Build model failed"); @@ -209,8 +221,7 @@ Status GeGenerator::GenerateOfflineModel(const Graph &graph, const string &file_ if (!model_name.empty() && !ge_models.empty()) { ge_models[0]->SetName(model_name); } - - ret = impl_->SaveModel(file_name_prefix, ge_models); + ret = impl_->SaveModel(file_name_prefix, ge_models, model); if (ret != SUCCESS) { GELOGE(ret, "Save model failed"); if (impl_->graph_manager_.Finalize() != SUCCESS) { @@ -243,6 +254,9 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector op_attrs = op_desc->GetAllAttrs(); + // 1. Create ComputeGraph. 
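+  // (The graph created below wraps just this one op; the later steps build it into a
+  // GeModel through the normal BuildModel path and re-attach the op attributes via
+  // SaveParams before the model is saved.)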
string name = ge::CurrentTimeInStr() + "_" + model_file_name; ge::ComputeGraphPtr compute_graph = MakeShared(name); @@ -290,11 +304,10 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vectorBuildModel(graph, inputs, graph_id, ge_models)); if (!ge_models.empty()) { - map op_attrs = op_desc->GetAllAttrs(); GE_CHK_STATUS_RET_NOLOG(impl_->SaveParams(ge_models[0], op_desc->GetType(), op_attrs, inputs, outputs)); } - - GE_CHK_STATUS_RET_NOLOG(impl_->SaveModel(model_file_name, ge_models)); + ModelBufferData model_buff; + GE_CHK_STATUS_RET_NOLOG(impl_->SaveModel(model_file_name, ge_models, model_buff)); return SUCCESS; } @@ -308,7 +321,8 @@ Status GeGenerator::Impl::SaveParams(GeModelPtr &ge_model, const string &type, c return SUCCESS; } -Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, vector models) { +Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, vector &models, + ModelBufferData &model_buff) { // to be change to ModelHelper interface if (models.empty()) { GELOGE(FAILED, "models are empty."); @@ -316,7 +330,8 @@ Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, vectorGetDirectNode()) { + for (const auto &node_ptr : graph->GetAllNodes()) { GE_CHECK_NOTNULL(node_ptr->GetOpDesc()); std::string kernel_lib_name = node_ptr->GetOpDesc()->GetOpKernelLibName(); if (kernel_lib_name.empty()) { @@ -98,7 +98,6 @@ Status GraphBuilder::Build(ComputeGraphPtr &comp_graph, std::vectorGetAllSubgraphs()) { + GraphUtils::DumpGEGraphToOnnx(*graph, "SubgraphGetTask"); + } + GE_TIMESTAMP_START(GetTaskInfo); ret = GetTaskInfo(builder, model_ptr, comp_graph, subgraph_ptr_list, session_id); GE_TIMESTAMP_END(GetTaskInfo, "GraphBuilder::GetTaskInfo"); @@ -186,8 +189,8 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr return ret; } - OptimizeStreamGraph optimize_stream; - ret = optimize_stream.OptimizeStreamedSubGraph(comp_graph, subgraph_ptr_list, run_context.GetRunContext()); + StreamGraphOptimizer stream_optimizer; + ret = stream_optimizer.OptimizeStreamedSubGraph(comp_graph, subgraph_ptr_list, run_context.GetRunContext()); if (ret != SUCCESS) { GELOGE(ret, "Optimize streamed subGraph fail."); return ret; @@ -208,6 +211,13 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) { // set input_desc.size = src_node.output_desc.size + if (node_ptr->GetType() == DATA) { + if (UpdateDataInputSize(node_ptr) != SUCCESS) { + GELOGE(FAILED, "Update data input size failed."); + return FAILED; + } + } + for (const auto &in_data_anchor : node_ptr->GetAllInDataAnchors()) { const auto &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); @@ -219,9 +229,9 @@ Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) { // set dst_node.input_desc = src_node.output_desc ge::GeTensorDesc desc_temp(src_op->GetOutputDesc(peer_out_anchor->GetIdx())); - uint32_t size = 0; + int64_t size = 0; GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(desc_temp, size) != SUCCESS, GELOGI("Get size failed!")); - GELOGD("src node %s output desc, dim_size: %zu, mem_size: %u, format: %s, type: %s.", src_node->GetName().c_str(), + GELOGD("src node %s output desc, dim_size: %zu, mem_size: %ld, format: %s, type: %s.", src_node->GetName().c_str(), desc_temp.GetShape().GetDimNum(), size, TypeUtils::FormatToSerialString(desc_temp.GetFormat()).c_str(), 
TypeUtils::DataTypeToSerialString(desc_temp.GetDataType()).c_str()); for (size_t i = 0; i < desc_temp.GetShape().GetDimNum(); ++i) { @@ -240,13 +250,56 @@ Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) { return SUCCESS; } -Status GraphBuilder::SecondPartition(ge::ComputeGraphPtr &comp_graph, - std::vector &subgraph_ptr_list) { +Status GraphBuilder::UpdateDataInputSize(const ge::NodePtr &node_ptr) { + const auto &op_desc = node_ptr->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(FAILED, "Op desc is nullptr."); + return FAILED; + } + // data op only has one output anchor + ge::GeTensorDesc output_desc = op_desc->GetOutputDesc(0); + int64_t output_size = 0; + if (ge::TensorUtils::GetSize(output_desc, output_size) != SUCCESS) { + GELOGW("Get size failed!"); + } + + if (output_size > 0) { + GELOGI("No need to update data input size."); + return SUCCESS; + } else { + int64_t real_dim_size = 0; + ge::graphStatus graph_status = TensorUtils::GetTensorSizeInBytes(output_desc, real_dim_size); + if (graph_status != GRAPH_SUCCESS) { + GELOGE(FAILED, "Get tensor size in bytes failed."); + return FAILED; + } + // data op only has one input anchor + ge::GeTensorDesc input_desc = op_desc->GetInputDesc(0); + ge::TensorUtils::SetSize(input_desc, real_dim_size); + if (op_desc->UpdateInputDesc(0, input_desc) != GRAPH_SUCCESS) { + GELOGE(FAILED, "Update input desc size failed."); + return FAILED; + } + } + return SUCCESS; +} + +Status GraphBuilder::SecondPartition(ge::ComputeGraphPtr &comp_graph, vector &subgraph_ptr_list) { GELOGI("[SecondPartition] second partition."); - subgraph_ptr_list.clear(); GE_TIMESTAMP_START(GraphPartition2); - Status ret = graph_partitioner_.Partition(comp_graph, subgraph_ptr_list, GraphPartitioner::kSecondPartitioning); + auto ret = graph_partitioner_.Partition(comp_graph, GraphPartitioner::kSecondPartitioning); + if (ret != SUCCESS) { + GELOGE(ret, "Graph partition Failed"); + return ret; + } GE_CHK_STATUS_RET(ret, "Graph partition Failed."); + auto graph_2_subgraphlist = graph_partitioner_.GetSubGraphMap(); + if (graph_2_subgraphlist.find(comp_graph) != graph_2_subgraphlist.end()) { + subgraph_ptr_list = graph_2_subgraphlist[comp_graph]; + } else { + GELOGE(FAILED, "Find subgraph failed."); + return FAILED; + } GE_TIMESTAMP_END(GraphPartition2, "GraphPartitioner::Partition2"); return ret; } diff --git a/src/ge/graph/build/graph_build.h b/src/ge/graph/build/graph_builder.h similarity index 97% rename from src/ge/graph/build/graph_build.h rename to src/ge/graph/build/graph_builder.h index b7ceec62..c1c4f7b6 100644 --- a/src/ge/graph/build/graph_build.h +++ b/src/ge/graph/build/graph_builder.h @@ -55,6 +55,7 @@ class GraphBuilder { Status GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr &model_ptr, ComputeGraphPtr &comp_graph, std::vector &subgraph_ptr_list, uint64_t session_id = INVALID_SESSION_ID); Status SetInputSize(const ge::NodePtr &node_ptr); + Status UpdateDataInputSize(const ge::NodePtr &node_ptr); Status SecondPartition(ge::ComputeGraphPtr &comp_graph, vector &subgraph_ptr_list); int build_mode_; diff --git a/src/ge/graph/build/label_allocator.cc b/src/ge/graph/build/label_allocator.cc new file mode 100644 index 00000000..46c092f5 --- /dev/null +++ b/src/ge/graph/build/label_allocator.cc @@ -0,0 +1,80 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "label_allocator.h" + +#include "framework/common/types.h" +#include "common/util.h" +#include "common/ge_inner_error_codes.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" +#include "graph/label/label_maker.h" + +namespace ge { + +LabelAllocator::LabelAllocator(const ComputeGraphPtr &graph) : compute_graph_(graph) {} + +Status LabelAllocator::AssignFunctionalLabels(uint32_t &label_index) { + if (compute_graph_ == nullptr) { + GELOGE(INTERNAL_ERROR, "ComputeGraph not set, Assign labels failed."); + return INTERNAL_ERROR; + } + + // Add label task for sub graph. + GELOGI("AssignFunctionalLabels start: %s.", compute_graph_->GetName().c_str()); + std::set functional_nodes; + for (auto graph : compute_graph_->GetAllSubgraphs()) { + if (!CollectFunctionalNode(graph, functional_nodes)) { + return INTERNAL_ERROR; + } + } + + // Add label for functional op. + label_index = 0; + for (auto node : functional_nodes) { + LabelMakerPtr maker = LabelMakerFactory::Instance().Create(node->GetType(), compute_graph_, node); + if (maker == nullptr) { + GELOGE(INTERNAL_ERROR, "Node: %s label maker not registed.", node->GetType().c_str()); + return INTERNAL_ERROR; + } + + if (maker->Run(label_index) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Node: %s run label maker failed.", node->GetType().c_str()); + return INTERNAL_ERROR; + } + } + + GELOGI("AssignFunctionalLabels success."); + return SUCCESS; +} + +bool LabelAllocator::CollectFunctionalNode(ComputeGraphPtr &graph, std::set &functional_nodes) { + if (graph == nullptr) { + GELOGE(INTERNAL_ERROR, "Sub ComputeGraph is null."); + return false; + } + + NodePtr parent = graph->GetParentNode(); + if (parent == nullptr) { + GELOGE(INTERNAL_ERROR, "ComputeGraph owner not set: %s.", graph->GetName().c_str()); + return false; + } + + (void)functional_nodes.insert(parent); // unique functional node. + return true; +} + +} // namespace ge diff --git a/src/ge/graph/build/label_allocator.h b/src/ge/graph/build/label_allocator.h new file mode 100644 index 00000000..01811e1d --- /dev/null +++ b/src/ge/graph/build/label_allocator.h @@ -0,0 +1,39 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_GRAPH_LABEL_ALLOCATOR_H_ +#define GE_GRAPH_LABEL_ALLOCATOR_H_ + +#include + +#include "graph/node.h" +#include "external/ge/ge_api_error_codes.h" + +namespace ge { +class LabelAllocator { + public: + explicit LabelAllocator(const ComputeGraphPtr &graph); + ~LabelAllocator() = default; + + Status AssignFunctionalLabels(uint32_t &label_index); + + private: + bool CollectFunctionalNode(ComputeGraphPtr &graph, std::set &functional_nodes); + + ComputeGraphPtr compute_graph_; +}; +} // namespace ge +#endif // GE_GRAPH_LABEL_ALLOCATOR_H_ \ No newline at end of file diff --git a/src/ge/graph/build/logical_stream_allocator.cc b/src/ge/graph/build/logical_stream_allocator.cc index 509f591f..2b11347b 100644 --- a/src/ge/graph/build/logical_stream_allocator.cc +++ b/src/ge/graph/build/logical_stream_allocator.cc @@ -27,6 +27,12 @@ using std::set; using std::string; using std::vector; +namespace { +const char *const kAICPUEngineName = "DNN_VM_AICPU"; +const char *const kAttrNameParentOpType = "parentOpType"; +const size_t kHeadNodeMaxNum = 820; // calculated by 1024 * 0.8 +} // namespace + namespace ge { LogicalStreamPass::LogicalStreamPass(const string &name) : name_(name) {} @@ -46,6 +52,24 @@ bool LogicalStreamPass::HasAssignedStream(const Subgraph &subgraph) const { return subgraph.stream_id != kInvalidStream; } +bool LogicalStreamPass::HasNonConstInputNode(const Subgraph &subgraph) const { + const SubGraphInfo &subgraph_info = subgraph.subgraph_info; + const auto &pld_to_end_map = subgraph_info.GetPld2EndMap(); + for (const auto &pld_to_end : pld_to_end_map) { + const NodePtr &placeholder = pld_to_end.first; + if (placeholder != nullptr) { + string parent_op_type; + if (AttrUtils::GetStr(placeholder->GetOpDesc(), kAttrNameParentOpType, parent_op_type)) { + if ((parent_op_type != CONSTANT) && (parent_op_type != CONSTANTOP)) { + return true; + } + } + } + } + + return false; +} + Status AssignByLabelPass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { bool changed = false; int64_t &next_stream = context.next_stream; @@ -108,6 +132,21 @@ Status IndependentStreamPass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { bool changed = false; + if (IsHeadNodeExceeded(subgraphs)) { + int64_t &next_stream = context.next_stream; + for (const SubgraphPtr &subgraph : subgraphs) { + if (!HasAssignedStream(*subgraph)) { + subgraph->stream_id = next_stream; + changed = true; + } + } + if (changed) { + ++next_stream; + return SUCCESS; + } + return NOT_CHANGED; + } + map end_subgraph_map; map pld_subgraph_map; InitEndSubgraphMap(subgraphs, end_subgraph_map); @@ -150,6 +189,24 @@ Status AssignByDependencyPass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs) const { + size_t aicpu_node_num = 0; + for (const SubgraphPtr &subgraph : subgraphs) { + if (subgraph->engine_conf.id == kAICPUEngineName && !HasNonConstInputNode(*subgraph)) { + const SubGraphInfo &subgraph_info = subgraph->subgraph_info; + auto compute_graph = subgraph_info.GetSubGraph(); + aicpu_node_num += compute_graph->GetDirectNode().size() - subgraph_info.GetPld2EndMap().size() - + subgraph_info.GetEnd2PldMap().size(); + if (aicpu_node_num > kHeadNodeMaxNum) { + GELOGI("aicpu_node_num, %zu", aicpu_node_num); + return true; + } + } + } + + return false; +} + void AssignByDependencyPass::InitEndSubgraphMap(const vector &subgraphs, map &end_subgraph_map) { for (const auto &subgraph : subgraphs) { @@ -266,6 +323,7 @@ void 
AssignByDependencyPass::UpdateAssignedSubgraphs(Context &context) { // Update the subgraphs assigned by the engine. for (auto &subgraph : assigned_subgraphs_) { subgraph->stream_id += engine_start_streams[subgraph->engine_conf.id]; + GELOGI("Stream of subgraph %s has been updated to %ld.", subgraph->name.c_str(), subgraph->stream_id); } } @@ -319,6 +377,8 @@ Status NodeStreamUpdatePass::Run(ComputeGraphPtr whole_graph, const vectorGetName().c_str(), + GELOGI("Node %s of type %s: its all input and output nodes are in same stream[%ld].", node->GetName().c_str(), node->GetType().c_str(), stream_id); return stream_id; } @@ -419,8 +479,8 @@ Status NodeStreamUpdatePass::UpdateForSkippedEngine(const ComputeGraphPtr &whole // Check if sub graph is engine skipped and without stream label or not for (const SubgraphPtr &subgraph : subgraphs) { if (IsEngineSkip(*subgraph) && !HasStreamLabel(*subgraph)) { - auto compute_graph = subgraph->subgraph_info.GetSubGraph(); - for (NodePtr &node : compute_graph->GetDirectNode()) { + auto graph = subgraph->subgraph_info.GetSubGraph(); + for (NodePtr &node : graph->GetDirectNode()) { auto op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); auto stream_id = op_desc->GetStreamId(); @@ -465,6 +525,42 @@ bool NodeStreamUpdatePass::AreAllPredStreamsInvalid(const NodePtr &node) const { return true; } +void NodeStreamUpdatePass::RefreshContinuousStreams(ComputeGraphPtr whole_graph, Context &context) const { + int64_t stream_num = context.next_stream; + vector stream_has_node(stream_num); + + for (const NodePtr &node : whole_graph->GetDirectNode()) { + if (node != nullptr) { + auto op_desc = node->GetOpDesc(); + if (op_desc != nullptr) { + int64_t stream_id = op_desc->GetStreamId(); + if (stream_id != kInvalidStream && stream_id < stream_num) { + stream_has_node[stream_id] = true; + } + } + } + } + + context.next_stream = 0; + vector old_to_new_streams(stream_num, kInvalidStream); + for (size_t old_stream = 0; old_stream < stream_has_node.size(); ++old_stream) { + if (stream_has_node[old_stream]) { + old_to_new_streams[old_stream] = context.next_stream; + ++context.next_stream; + } + } + + for (const NodePtr &node : whole_graph->GetDirectNode()) { + auto op_desc = node->GetOpDesc(); + if (op_desc != nullptr) { + int64_t stream_id = op_desc->GetStreamId(); + if (stream_id != kInvalidStream && stream_id < stream_num) { + op_desc->SetStreamId(old_to_new_streams[stream_id]); + } + } + } +} + LogicalStreamAllocator::LogicalStreamAllocator(const map &scheduler_confs, const map &max_parallel_num, bool hcom_parallel) : scheduler_confs_(scheduler_confs), max_parallel_num_(max_parallel_num) { @@ -475,6 +571,7 @@ Status LogicalStreamAllocator::Assign(const ComputeGraphPtr &whole_graph, const int64_t &stream_num) { GE_CHECK_NOTNULL(whole_graph); map engine_confs; + GE_TIMESTAMP_START(InitEngineConfs); for (const auto &item : scheduler_confs_) { const SchedulerConf &scheduler = item.second; for (const auto &engine_pair : scheduler.cal_engines) { @@ -484,9 +581,12 @@ Status LogicalStreamAllocator::Assign(const ComputeGraphPtr &whole_graph, const } } } + GE_TIMESTAMP_END(InitEngineConfs, "GraphBuilder::AssignStreamInitEngineConfs"); vector subgraphs; + GE_TIMESTAMP_START(ConvertSubgraphs); Status status = ConvertSubgraphs(subgraph_infos, engine_confs, subgraphs); + GE_TIMESTAMP_END(ConvertSubgraphs, "GraphBuilder::AssignStreamConvertSubgraphs"); if (status != SUCCESS) { GELOGE(status, "Create subgraphs failed."); return status; diff --git 
a/src/ge/graph/build/logical_stream_allocator.h b/src/ge/graph/build/logical_stream_allocator.h index 83c5f668..2265a0f3 100644 --- a/src/ge/graph/build/logical_stream_allocator.h +++ b/src/ge/graph/build/logical_stream_allocator.h @@ -80,6 +80,9 @@ class LogicalStreamPass { bool HasStreamLabel(const Subgraph &subgraph) const; bool HasAssignedStream(const Subgraph &subgraph) const; + // Determine if the input of the subgraph is a constant. + bool HasNonConstInputNode(const Subgraph &subgraph) const; + private: std::string name_; }; @@ -117,6 +120,7 @@ class AssignByDependencyPass : public LogicalStreamPass { void UpdateAssignedSubgraphs(Context &context); void UpdateReusedSubgraphs(); + bool IsHeadNodeExceeded(const std::vector &subgraphs) const; bool CouldReuse(const SubgraphPtr &subgraph, const SubgraphPtr &pred_subgraph, const std::map &pld_subgraph_map); @@ -151,6 +155,7 @@ class NodeStreamUpdatePass : public LogicalStreamPass { int64_t GetSingleInoutStream(const NodePtr &node) const; // Judge if all predecessors' streams of node are INVALID_STREAM bool AreAllPredStreamsInvalid(const NodePtr &node) const; + void RefreshContinuousStreams(ComputeGraphPtr whole_graph, Context &context) const; }; // AllReduce and backward operators execute in parallel. diff --git a/src/ge/graph/build/memory/block_mem_assigner.cc b/src/ge/graph/build/memory/block_mem_assigner.cc index 70b36ad1..77860e4d 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.cc +++ b/src/ge/graph/build/memory/block_mem_assigner.cc @@ -33,12 +33,16 @@ #include "graph/optimize/common/params.h" #include "omg/omg_inner_types.h" +#include "runtime/mem.h" namespace { const char *const kAttrNameWorkspaceReuseFlag = "workspace_reuse_flag"; const char *const kL2FusionDynamicConvergeOp = "l2fusion_dynamic_converge_op"; const char *const kDisableReuseMemory = "ge.exec.disableReuseMemory"; +const char *const OP_NO_REUSE_MEM = "OP_NO_REUSE_MEM"; const int kReuseMaxCount = 10; +const int kReuseMaxOpNum = 10; +const int kReuseMaxCharNum = 2000; } // namespace namespace ge { @@ -57,8 +61,8 @@ void MemoryBlock::Resize() { return; } else { size_t block_size = *iter; - if ((block_size > 0) && (block_size % kMemAlignSize != 0)) { - block_size = (block_size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize; + if ((block_size > 0) && (block_size % MEM_ALIGN_SIZE != 0)) { + block_size = (block_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE; } block_size_ = block_size; } @@ -69,7 +73,7 @@ bool MemoryBlock::IsSameLabel(std::string &first_batch_label) { return false; } - auto node_op_desc = node_type_index_list_[0].node_->GetOpDesc(); + auto node_op_desc = node_type_index_list_[0].node->GetOpDesc(); if (node_op_desc == nullptr) { return false; } @@ -80,11 +84,11 @@ bool MemoryBlock::IsSameLabel(std::string &first_batch_label) { } bool all_same_label = true; for (size_t index = 1; index < node_type_index_list_.size(); ++index) { - if (node_type_index_list_[index].node_ == nullptr) { + if (node_type_index_list_[index].node == nullptr) { continue; } std::string batch_label; - auto index_op_desc = node_type_index_list_[index].node_->GetOpDesc(); + auto index_op_desc = node_type_index_list_[index].node->GetOpDesc(); GE_IF_BOOL_EXEC(index_op_desc == nullptr, continue); (void)ge::AttrUtils::GetStr(index_op_desc, ATTR_NAME_BATCH_LABEL, batch_label); if (first_batch_label != batch_label) { @@ -95,15 +99,35 @@ bool MemoryBlock::IsSameLabel(std::string &first_batch_label) { return all_same_label; } +void SetLastUsedInputMemAttr(NodePtr 
&node, int input_index) { + if (node == nullptr) { + return; + } + auto node_op_desc = node->GetOpDesc(); + if (node_op_desc != nullptr) { + auto input_desc = node_op_desc->GetInputDesc(input_index); + if (!ge::AttrUtils::SetInt(input_desc, ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE, true)) { + GELOGW("Set %s input[%d] ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE to true failed.", node_op_desc->GetName().c_str(), + input_index); + return; + } + GELOGD("Set %s input[%d] ATTR_NAME_IS_END_OF_INPUTMEM_LIFECYCLE to true success.", node_op_desc->GetName().c_str(), + input_index); + if (node_op_desc->UpdateInputDesc(input_index, input_desc) != GRAPH_SUCCESS) { + GELOGW("Update %s input[%d] desc failed.", node_op_desc->GetName().c_str(), input_index); + } + } +} + string ToString(ge::NodeTypeIndex &x) { stringstream ss; - ss << "[" << x.node_->GetName() << "(" << x.node_->GetType() << "), "; - if (x.mem_type_ == kOutput) { + ss << "[" << x.node->GetName() << "(" << x.node->GetType() << "), "; + if (x.mem_type == kOutput) { ss << "Output, "; } else { ss << "Workspace, "; } - ss << x.index_ << "]"; + ss << x.index << "]"; return ss.str(); } @@ -140,7 +164,7 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector &all_memory_size) { GELOGI("Get reuse_input failed")); if (!reuse_input) { - uint32_t size = 0; + int64_t size = 0; GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_desc, size) != SUCCESS, GELOGI("Get size failed")); all_memory_size.emplace_back(size); } @@ -182,13 +206,8 @@ size_t GetBlockSize(size_t size, const vector &ranges) { bool IsDirectOutputNode(const NodePtr &node, int idx) { if ((node != nullptr) && (node->GetOpDesc() != nullptr) && (node->GetOpDesc()->GetType() == NETOUTPUT)) { - auto op_desc = node->GetOpDesc(); - auto input_desc = op_desc->MutableInputDesc(idx); - auto output_desc = op_desc->MutableOutputDesc(idx); - if ((input_desc != nullptr) && (output_desc != nullptr) && (input_desc->GetFormat() == output_desc->GetFormat()) && - (input_desc->GetDataType() == output_desc->GetDataType())) { - return true; - } + GELOGI("This is netoutput node, the input node mem can not be reused"); + return true; } return false; } @@ -241,7 +260,8 @@ bool CanReuseByStream(const std::unordered_set &reuse_stream, MemoryBlo } MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, MemoryType mem_type, const NodePtr &n, - uint32_t out_index, const vector &workspace_reuse_flag) { + uint32_t out_index, const vector &workspace_reuse_flag, + const bool is_op_reuse_mem) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "Input parameter n is null."); auto node_op_desc = n->GetOpDesc(); GE_IF_BOOL_EXEC(node_op_desc == nullptr, return nullptr); @@ -250,10 +270,8 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, (void)ge::GetContext().GetOption(kDisableReuseMemory, ge_disable_reuse_mem_env); if (ge_disable_reuse_mem_env != "1") { int64_t convergence_label; - bool reuse_mem_flag = true; - if ((workspace_reuse_flag.size() > out_index) && (workspace_reuse_flag[out_index] == false)) { - reuse_mem_flag = false; - } + bool reuse_mem_flag = + ((workspace_reuse_flag.size() > out_index) && (workspace_reuse_flag[out_index] == false)) ? 
false : true; if (!ge::AttrUtils::GetInt(node_op_desc, kL2FusionDynamicConvergeOp, convergence_label)) { bool out_flg = false; GE_IF_BOOL_EXEC(n->GetOutDataNodes().empty(), out_flg = true); @@ -262,14 +280,14 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, if (IsDirectOutputNode(in_anchor->GetOwnerNode(), in_anchor->GetIdx())) { out_flg = true; break; - } else { - break; } } auto op_type = node_op_desc->GetType(); bool is_reuse_memory = !out_flg && reuse_mem_flag && (op_type != DATA_TYPE) && (op_type != AIPP_DATA_TYPE) && (op_type != CONSTANT) && (op_type != NETOUTPUT) && (op_type != PROPOSAL) && - (op_type != ANN_DATA_TYPE) && (op_type != ZEROSLIKE) && (op_type != CONSTANTOP); + (op_type != ANN_DATA_TYPE) && (op_type != ZEROSLIKE) && (op_type != CONSTANTOP) && + is_op_reuse_mem; + auto stream_id = node_op_desc->GetStreamId(); auto map_iter = reusable_streams_map_.find(stream_id); if (is_reuse_memory && map_iter != reusable_streams_map_.end()) { @@ -277,7 +295,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, MemoryBlock *reusable_block = *it; bool is_data = false; for (auto node_type : reusable_block->NodeTypeIndexList()) { - GE_IF_BOOL_EXEC(node_type.node_ != nullptr, string type = node_type.node_->GetType(); + GE_IF_BOOL_EXEC(node_type.node != nullptr, string type = node_type.node->GetType(); bool flag = (type == DATA_TYPE) || (type == ENTER) || (type == REFENTER) || (type == AIPP_DATA_TYPE) || (type == NEXTITERATION) || (type == REFNEXTITERATION); @@ -302,7 +320,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, } } - auto block = new (std::nothrow) MemoryBlock(block_size); + auto block = new (std::nothrow) MemoryBlock(block_size, is_op_reuse_mem); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(block == nullptr, return nullptr, "new an object failed."); block->Init(real_size, mem_type, n, out_index); @@ -312,14 +330,15 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, return block; } -MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index, const vector &ranges) { +MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index, const vector &ranges, + const bool is_op_reuse_mem) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null."); auto node_op_desc = n->GetOpDesc(); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); MemoryBlock *block = nullptr; bool reuse_input = false; uint32_t reuse_input_index = 0; - uint32_t size = 0; + int64_t size = 0; auto output_op_desc = node_op_desc->GetOutputDescPtr(index); if (output_op_desc != nullptr) { GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInput(*output_op_desc, reuse_input) != SUCCESS, @@ -343,7 +362,7 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index, } else { auto block_size = GetBlockSize(size, ranges); vector workspace_reuse_flag; - block = ApplyMemory(block_size, size, kOutput, n, index, workspace_reuse_flag); + block = ApplyMemory(block_size, size, kOutput, n, index, workspace_reuse_flag, is_op_reuse_mem); } GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(block == nullptr, return nullptr, "Block is nullptr."); int out_count_reuse_input = block->ref_count_; @@ -430,6 +449,7 @@ bool IsReferencePreviousNodeOutputMemory(const ge::NodePtr &node, uint32_t outpu void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector &reusable_memory) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == 
nullptr, return, "Input parameter to_release is null."); GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory"); + GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory"); --to_release->ref_count_; if (to_release->ref_count_ == 0) { reusable_memory.emplace_back(to_release); @@ -444,12 +464,11 @@ void BlockMemAssigner::ReleaseMemorys(const vector &to_releases, } } -void BlockMemAssigner::ReleaseInputNodeOutMemory(const NodePtr &n, - const unordered_map> &node_out_blocks, - vector &reusable_memory) { - for (const auto &in_anchor : n->GetAllInDataAnchors()) { +void BlockMemAssigner::ReleaseInputNodeOutMemory(const unordered_map> &node_out_blocks, + vector &reusable_memory, NodePtr &node) { + for (const auto &in_anchor : node->GetAllInDataAnchors()) { if ((in_anchor->GetPeerOutAnchor() == nullptr) || - (in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc() == nullptr) || (n->GetOpDesc() == nullptr)) { + (in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc() == nullptr) || (node->GetOpDesc() == nullptr)) { return; } GE_IF_BOOL_EXEC(IsOutputBlock(in_anchor), continue); @@ -470,16 +489,102 @@ void BlockMemAssigner::ReleaseInputNodeOutMemory(const NodePtr &n, if (node_type_indexs.empty()) { continue; } - GELOGD("node_type_indexs: %d, %s", node_type_indexs.back().index_, - node_type_indexs.back().node_->GetName().c_str()); + GELOGD("node_type_indexs: %d, %s", node_type_indexs.back().index, + node_type_indexs.back().node->GetName().c_str()); - if ((node_type_indexs.back().node_ == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) && - (node_type_indexs.back().index_ == static_cast(in_anchor->GetPeerOutAnchor()->GetIdx())) && - n->GetOpDesc()->GetStreamId() == block->stream_id_) { + if ((node_type_indexs.back().node == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) && + (node_type_indexs.back().index == static_cast(in_anchor->GetPeerOutAnchor()->GetIdx())) && + (node->GetOpDesc()->GetStreamId() == block->stream_id_)) { ReleaseMemory(block, reusable_memory); + if (block->ref_count_ == 0) { + SetLastUsedInputMemAttr(node, in_anchor->GetIdx()); + } + } + } + } +} + +void SplitStringByComma(const string &str, vector &sub_str_vec) { + std::string tmp_string = str + ","; + std::string::size_type start_pos = 0; + std::string::size_type cur_pos = tmp_string.find(',', 0); + while (cur_pos != std::string::npos) { + std::string sub_str = tmp_string.substr(start_pos, cur_pos - start_pos); + if (!sub_str.empty()) { + vector::iterator ret = std::find(sub_str_vec.begin(), sub_str_vec.end(), sub_str); + if (ret == sub_str_vec.end()) { + sub_str_vec.push_back(sub_str); } } + start_pos = cur_pos + 1; + cur_pos = tmp_string.find(',', start_pos); + } +} + +void CheckAndGetOpReuseEnv(const string &env, vector &env_vec, bool &op_reuse_env_valid) { + string env_str; + env_str = string(env); + if (env_str.size() > kReuseMaxCharNum) { + GELOGE(FAILED, "The OP_NO_REUSE_MEM has more than %d characters.", kReuseMaxCharNum); + return; + } + + SplitStringByComma(env_str, env_vec); + if (env_vec.size() > kReuseMaxOpNum) { + GELOGE(FAILED, "The OP_NO_REUSE_MEM has more than %d nodes.", kReuseMaxOpNum); + return; + } + + op_reuse_env_valid = true; + return; +} + +Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector &ranges) { + auto node_op_desc = node->GetOpDesc(); + int64_t stream_id = node_op_desc->GetStreamId(); + vector memorys_type; + bool has_mem_type_attr = ge::AttrUtils::GetListInt(node_op_desc, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, memorys_type); + 
GELOGI("Assign memory node[%s], output size[%d], output memory type size[%d]", node_op_desc->GetName().c_str(), + node_op_desc->GetOutputsSize(), memorys_type.size()); + if (has_mem_type_attr && (memorys_type.size() != node_op_desc->GetOutputsSize())) { + GELOGE(INTERNAL_ERROR, "L1fusion: node[%s], output memory size err[outputsize:%zu, memorysize:%zu]", + node_op_desc->GetName().c_str(), node_op_desc->GetOutputsSize(), memorys_type.size()); + return INTERNAL_ERROR; + } + if (op_reuse_env_valid_ == true) { + vector::iterator it_name = + std::find(op_no_reuse_mem_vec_.begin(), op_no_reuse_mem_vec_.end(), node_op_desc->GetName()); + vector::iterator it_type = + std::find(op_no_reuse_mem_vec_.begin(), op_no_reuse_mem_vec_.end(), node_op_desc->GetType()); + GE_IF_BOOL_EXEC(it_name != op_no_reuse_mem_vec_.end() || it_type != op_no_reuse_mem_vec_.end(), + is_op_reuse_mem_ = false;); + } + + // Allocate memory for the current node and release node memory of the same size in the workspace + GE_IF_BOOL_EXEC(ge_disable_reuse_mem_env_ != "1", + ReleaseMemorys(stream_workspace_blocks_[stream_id], reusable_blocks_);) + for (uint32_t i = 0; i < static_cast(node_op_desc->GetOutputsSize()); i++) { + int64_t size = 0; + auto output_op_desc = node_op_desc->GetOutputDescPtr(i); + if (output_op_desc != nullptr) { + GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS, GELOGI("Get size failed")); + } + // l1 fusion: l1 type's size not means malloc HBM memory + if (has_mem_type_attr && memorys_type[i] != RT_MEMORY_HBM) { + GELOGI("L1fusion: node[%s], output[%s], output memory type [%d]", node_op_desc->GetName().c_str(), + node_op_desc->GetOutputNameByIndex(i).c_str(), memorys_type[i]); + size = 0; + } + if ((size == 0) || CheckIsZeroMemNodeType(node->GetType()) || IsReferencePreviousNodeOutputMemory(node, i)) { + zero_memory_list_.emplace_back(node, kOutput, i); + continue; + } + MemoryBlock *mem_block = ApplyOutMemory(node, i, ranges, is_op_reuse_mem_); + if (mem_block != nullptr) { + node_out_blocks_[node->GetName()].emplace_back(mem_block); + } } + return SUCCESS; } /// @@ -491,57 +596,60 @@ void BlockMemAssigner::ReleaseInputNodeOutMemory(const NodePtr &n, void BlockMemAssigner::AssignMemoryWithReuse(vector &ranges) { // Init reusable streams map InitReusableStreamMap(); - string ge_disable_reuse_mem_env = "0"; - (void)ge::GetContext().GetOption("ge.exec.disableReuseMemory", ge_disable_reuse_mem_env); - if (ge_disable_reuse_mem_env == "1") { - GEEVENT("Reuse memory close"); - } else { - GEEVENT("Reuse memory open"); - } + (void)ge::GetContext().GetOption("ge.exec.disableReuseMemory", ge_disable_reuse_mem_env_); - for (const NodePtr &n : compute_graph_->GetDirectNode()) { + GEEVENT("Reuse memory %s", ge_disable_reuse_mem_env_ == "1" ? 
"close" : "open"); + string op_no_reuse_mem_str; + const char *op_no_reuse_mem = std::getenv(OP_NO_REUSE_MEM); + GE_IF_BOOL_EXEC(op_no_reuse_mem != nullptr, op_no_reuse_mem_str = string(op_no_reuse_mem); + CheckAndGetOpReuseEnv(op_no_reuse_mem_str, op_no_reuse_mem_vec_, op_reuse_env_valid_);); + + for (NodePtr &n : compute_graph_->GetDirectNode()) { auto node_op_desc = n->GetOpDesc(); GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue); int64_t stream_id = node_op_desc->GetStreamId(); - - // Allocate memory for the current node and release node memory of the same size in the workspace - GE_IF_BOOL_EXEC(ge_disable_reuse_mem_env != "1", - ReleaseMemorys(stream_workspace_blocks_[stream_id], reusable_blocks_);) - for (uint32_t i = 0; i < static_cast(node_op_desc->GetOutputsSize()); i++) { - uint32_t size = 0; - auto output_op_desc = node_op_desc->GetOutputDescPtr(i); - if (output_op_desc != nullptr) { - GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS, GELOGI("Get size failed")); - } - if ((size == 0) || CheckIsZeroMemNodeType(n->GetType()) || IsReferencePreviousNodeOutputMemory(n, i)) { - zero_memory_list_.emplace_back(n, kOutput, i); - continue; - } - MemoryBlock *mem_block = ApplyOutMemory(n, i, ranges); - if (mem_block != nullptr) { - node_out_blocks_[n->GetName()].emplace_back(mem_block); - } + if (AssignOutputMemoryWithReuse(n, ranges) != SUCCESS) { + return; } stream_workspace_blocks_[stream_id].clear(); vector temp; GetNodeWorkSpaceSize(n, temp); + vector workspace_bytes; + vector workspace_memory_type; + bool has_workspace_mem_type_attr = + ge::AttrUtils::GetListInt(node_op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, workspace_memory_type); vector workspace_reuse_flag; GE_IF_BOOL_EXEC(!ge::AttrUtils::GetListBool(node_op_desc, kAttrNameWorkspaceReuseFlag, workspace_reuse_flag), GELOGD("OP %s get workspace_reuse_flag attr failed", node_op_desc->GetName().c_str())); + GELOGI("Assign memory node[%s], size [temp:%zu, memory type size:%zu]", node_op_desc->GetName().c_str(), + temp.size(), workspace_memory_type.size()); + + if (has_workspace_mem_type_attr && (temp.size() != workspace_memory_type.size())) { + GELOGE(INTERNAL_ERROR, "L1fusion: node[%s], workspace_memory size err![v_temp:%zu, workspace:%zu]", temp.size(), + workspace_memory_type.size()); + return; + } for (size_t i = 0; i < temp.size(); i++) { - if (temp[i] == 0) { + // l1 fusion: l1 type's size not means malloc HBM memory + bool workspace_skip_flag = false; + if (has_workspace_mem_type_attr && workspace_memory_type[i] != RT_MEMORY_HBM) { + GELOGI("L1fusion: node[%s]workspace index[%d] is l1 type, add to zero_memory_list, workspace memory type [%ld]", + node_op_desc->GetName().c_str(), i, workspace_memory_type[i]); + workspace_skip_flag = true; + } + if (temp[i] == 0 || workspace_skip_flag) { zero_memory_list_.emplace_back(n, kWorkspace, static_cast(i)); continue; } MemoryBlock *mem_block = ApplyMemory(GetBlockSize(static_cast(temp[i]), ranges), static_cast(temp[i]), kWorkspace, n, - static_cast(i), workspace_reuse_flag); + static_cast(i), workspace_reuse_flag, is_op_reuse_mem_); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(mem_block == nullptr, continue, "failed to apply memory block."); CheckWorkspaceReuse(workspace_reuse_flag, i, stream_id, mem_block); } - ReleaseInputNodeOutMemory(n, node_out_blocks_, reusable_blocks_); + ReleaseInputNodeOutMemory(node_out_blocks_, reusable_blocks_, n); } GELOGD("Assigned memory blocks:"); @@ -550,7 +658,7 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector &ranges) { (void)mem_block; 
-  GE_IF_BOOL_EXEC(!(ge_disable_reuse_mem_env == "1"), MergeDynamicBatchBlocks();)
+  GE_IF_BOOL_EXEC(!(ge_disable_reuse_mem_env_ == "1"), MergeDynamicBatchBlocks();)
   ResizeMemoryBlocks();

   GELOGD("Memory blocks after resize:");
@@ -669,11 +777,14 @@ void BlockMemAssigner::ResizeMemoryBlocks() {
 /// @return Status result
 ///
 void SetOffsetSize(const NodeTypeIndex &node_type_index, int64_t offset, size_t size, size_t real_size) {
-  ge::OpDescPtr op_desc = node_type_index.node_->GetOpDesc();
+  ge::OpDescPtr op_desc = node_type_index.node->GetOpDesc();
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(op_desc == nullptr, return, "op_desc is null.");
-  if (node_type_index.mem_type_ == kOutput) {
+  string graph_name = node_type_index.node->GetOwnerComputeGraph()->GetName();
+  vector<int64_t> memorys_type;
+  bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, memorys_type);
+  if (node_type_index.mem_type == kOutput) {
     vector<int64_t> output_list = op_desc->GetOutputOffset();
-    for (auto i = static_cast<uint32_t>(output_list.size()); i < node_type_index.index_ + 1; i++) {
+    for (auto i = static_cast<uint32_t>(output_list.size()); i < node_type_index.index + 1; i++) {
       output_list.emplace_back(kInvalidOffset);
     }
     if (output_list.empty()) {
@@ -683,21 +794,32 @@ void SetOffsetSize(const NodeTypeIndex &node_type_index, int64_t offset, size_t
     if ((op_desc->GetType() == DATA) || (op_desc->GetType() == AIPP_DATA_TYPE) || (op_desc->GetType() == MULTISHAPE) ||
         (op_desc->GetType() == NETOUTPUT)) {
-      if ((output_list[node_type_index.index_] == kInvalidOffset) || (output_list[node_type_index.index_] < offset)) {
-        output_list.at(node_type_index.index_) = offset;
+      if ((output_list[node_type_index.index] == kInvalidOffset) || (output_list[node_type_index.index] < offset)) {
+        output_list.at(node_type_index.index) = offset;
       }
     } else {
-      output_list.at(node_type_index.index_) = offset;
+      // l1 fusion: keep the original offset value from op_desc
+      bool set_out_offset = (!has_mem_type_attr) || (memorys_type[node_type_index.index] == RT_MEMORY_HBM);
+      if (set_out_offset) {
+        output_list.at(node_type_index.index) = offset;
+      }
     }
-    op_desc->SetOutputOffset(output_list);
-  } else if (node_type_index.mem_type_ == kWorkspace) {
+  } else if (node_type_index.mem_type == kWorkspace) {
     vector<int64_t> workspace_list;
     workspace_list = op_desc->GetWorkspace();
-    for (auto i = static_cast<uint32_t>(workspace_list.size()); i < node_type_index.index_ + 1; i++) {
+    for (auto i = static_cast<uint32_t>(workspace_list.size()); i < node_type_index.index + 1; i++) {
       workspace_list.emplace_back(kInvalidOffset);
     }
-    workspace_list.at(node_type_index.index_) = offset;
+    vector<int64_t> workspace_memory_type;
+    bool has_workspace_mem_type_attr =
+      ge::AttrUtils::GetListInt(op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, workspace_memory_type);
+    // l1 fusion: keep the original offset value from op_desc
+    bool set_workspace_offset =
+      (!has_workspace_mem_type_attr) || (workspace_memory_type[node_type_index.index] == RT_MEMORY_HBM);
+    if (set_workspace_offset) {
+      workspace_list.at(node_type_index.index) = offset;
+    }
     op_desc->SetWorkspace(workspace_list);
   }
 }
@@ -786,7 +908,7 @@ void BlockMemAssigner::FindHeadAndTailNodesForStream(map
   for (size_t i = 0; i < n->GetOpDesc()->GetOutputsSize(); i++) {
-    uint32_t size = 0;
+    int64_t size = 0;
     if (ge::TensorUtils::GetSize(*n->GetOpDesc()->GetOutputDescPtr(static_cast<uint32_t>(i)), size) != SUCCESS) {
       GELOGW("Get output size failed!");
       continue;
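// The offset-write policy of SetOffsetSize above, condensed into one predicate. With L1 fusion
// enabled, only HBM-typed output or workspace slots accept the newly computed offset; L1-typed
// slots keep whatever offset the op_desc already carries. Sketch only; the helper name and the
// way the HBM constant is passed in are illustrative, not GE API.

#include <cstdint>
#include <vector>

namespace sketch {
// mem_types comes from ATTR_NAME_OUTPUT_MEM_TYPE_LIST or TVM_ATTR_NAME_WORKSPACE_TYPE.
bool ShouldOverwriteOffset(bool has_mem_type_attr, const std::vector<int64_t> &mem_types,
                           size_t index, int64_t rt_memory_hbm) {
  return !has_mem_type_attr || mem_types[index] == rt_memory_hbm;
}
// Usage mirrors the code above:
//   if (ShouldOverwriteOffset(...)) { workspace_list.at(i) = offset; }
}  // namespace sketch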
diff --git a/src/ge/graph/build/memory/block_mem_assigner.h b/src/ge/graph/build/memory/block_mem_assigner.h
index 4a826cfc..d0cb5339 100644
--- a/src/ge/graph/build/memory/block_mem_assigner.h
+++ b/src/ge/graph/build/memory/block_mem_assigner.h
@@ -34,19 +34,20 @@ enum MemoryType { kOutput, kWorkspace };
 struct NodeTypeIndex {
   NodeTypeIndex(ge::NodePtr node, MemoryType mem_type, uint32_t index)
-      : node_(std::move(node)), mem_type_(mem_type), index_(index) {}
+      : node(std::move(node)), mem_type(mem_type), index(index) {}

-  ge::NodePtr node_ = nullptr;
-  MemoryType mem_type_ = kOutput;
-  uint32_t index_ = 0;
+  ge::NodePtr node = nullptr;
+  MemoryType mem_type = kOutput;
+  uint32_t index = 0;
 };

 class MemoryBlock {
  public:
-  explicit MemoryBlock(size_t block_size)
+  explicit MemoryBlock(size_t block_size, bool reuse_mem = true)
       : ref_count_(0),
         stream_id_(0),
         deleted_block_(false),
+        reuse_mem_(reuse_mem),
         block_size_(block_size),
         head_offset_(0),
         tail_offset_(0) {}
@@ -88,6 +89,7 @@ class MemoryBlock {
   int ref_count_;
   int64_t stream_id_;
   bool deleted_block_;
+  bool reuse_mem_;

  private:
   size_t block_size_;
@@ -198,8 +200,10 @@ class BlockMemAssigner : public MemAssigner {
   /// @return MemoryBlock*
   /// @author
   ///
-  MemoryBlock *ApplyOutMemory(const ge::NodePtr &n, uint32_t index, const std::vector<int64_t> &ranges);
+  MemoryBlock *ApplyOutMemory(const ge::NodePtr &n, uint32_t index, const std::vector<int64_t> &ranges,
+                              const bool is_op_reuse_mem);
+
+  Status AssignOutputMemoryWithReuse(const NodePtr &node, vector<int64_t> &ranges);
   ///
   /// @ingroup GE
   /// @brief Traversing the compute_graph_ to apply for memory while considering reuse
@@ -213,7 +217,8 @@ class BlockMemAssigner : public MemAssigner {
   /// @author
   ///
   MemoryBlock *ApplyMemory(size_t block_size, size_t real_size, MemoryType mem_type, const ge::NodePtr &n,
-                           uint32_t out_index, const std::vector<bool> &workspace_reuse_flag);
+                           uint32_t out_index, const std::vector<bool> &workspace_reuse_flag,
+                           const bool is_op_reuse_mem);

   ///
   /// @ingroup GE
@@ -257,9 +262,8 @@ class BlockMemAssigner : public MemAssigner {
   /// @return void
   /// @author
   ///
-  void ReleaseInputNodeOutMemory(const ge::NodePtr &n,
-                                 const std::unordered_map<std::string, std::vector<MemoryBlock *>> &node_out_blocks,
-                                 vector<MemoryBlock *> &reusable_memory);
+  void ReleaseInputNodeOutMemory(const std::unordered_map<std::string, std::vector<MemoryBlock *>> &node_out_blocks,
+                                 vector<MemoryBlock *> &reusable_memory, ge::NodePtr &n);

   ///
   /// @ingroup GE
@@ -279,6 +283,15 @@ class BlockMemAssigner : public MemAssigner {

   // save stream_id and reusable stream_ids
   std::unordered_map<int64_t, std::unordered_set<int64_t>> reusable_streams_map_;
+
+  // reuse memory
+  vector<string> op_no_reuse_mem_vec_;
+
+  bool op_reuse_env_valid_ = false;
+
+  std::string ge_disable_reuse_mem_env_ = "0";
+
+  bool is_op_reuse_mem_ = true;
 };
 }  // namespace ge
 #endif  // GE_GRAPH_BUILD_MEMORY_BLOCK_MEM_ASSIGNER_H_
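// How the new reuse switches declared above can be populated from the environment. The patch
// calls CheckAndGetOpReuseEnv but does not show its body, so the comma-separated format parsed
// here, and the assumption that the OP_NO_REUSE_MEM constant holds the literal variable name
// used below, are illustrative only.

#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

namespace sketch {
void ParseOpNoReuseEnv(std::vector<std::string> &op_no_reuse_mem_vec, bool &env_valid) {
  const char *env = std::getenv("OP_NO_REUSE_MEM");  // assumed env key
  if (env == nullptr) {
    return;  // leave defaults: reuse stays enabled for every op
  }
  std::stringstream ss{std::string(env)};
  std::string item;
  while (std::getline(ss, item, ',')) {  // assumed comma-separated op list
    if (!item.empty()) {
      op_no_reuse_mem_vec.emplace_back(item);
    }
  }
  env_valid = !op_no_reuse_mem_vec.empty();
}
}  // namespace sketch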
diff --git a/src/ge/graph/build/memory/graph_mem_assigner.cc b/src/ge/graph/build/memory/graph_mem_assigner.cc
index f3f8494d..33e8fcad 100644
--- a/src/ge/graph/build/memory/graph_mem_assigner.cc
+++ b/src/ge/graph/build/memory/graph_mem_assigner.cc
@@ -17,6 +17,7 @@
 #include "graph/build/memory/graph_mem_assigner.h"
 #include
 #include
+#include "common/math/math_util.h"
 #include "framework/common/debug/ge_log.h"
 #include "graph/build/memory/hybrid_mem_assigner.h"
 #include "graph/build/memory/var_mem_assign_util.h"
@@ -28,6 +29,7 @@
 #include "graph/utils/type_utils.h"
 namespace {
+const int kDataOutputIndex = 0;
 const int kAllInputAddrIsAtomic = -1;
 }  // namespace
 namespace ge {
@@ -53,16 +55,12 @@ Status VariableMemoryAssigner::AssignVarAttr2Nodes() {
 }

 Status GraphMemoryAssigner::AssignMemory() {
-  auto mem_assigner = std::unique_ptr<ge::HybridMemAssigner>(new (std::nothrow) ge::HybridMemAssigner(compute_graph_));
-  if (mem_assigner == nullptr) {
-    GELOGE(ge::FAILED, "Alloc HybridMemAssigner failed.");
-    return ge::FAILED;
-  }
-  if (mem_assigner->Assign() != ge::SUCCESS) {
+  ge::HybridMemAssigner mem_assigner(compute_graph_);
+  if (mem_assigner.Assign() != ge::SUCCESS) {
     GELOGE(ge::FAILED, "Memory assigner failed");
     return ge::FAILED;
   }
-  MemoryOffset memory_offset(RT_MEMORY_HBM, mem_assigner->GetMemOffset());
+  MemoryOffset memory_offset(RT_MEMORY_HBM, mem_assigner.GetMemOffset());
   memory_offset_.push_back(memory_offset);

   auto session_id = compute_graph_->GetSessionID();
@@ -95,6 +93,48 @@ ge::Status GraphMemoryAssigner::AssignVarAttr2Nodes() {
   return ge::SUCCESS;
 }

+ge::Status GraphMemoryAssigner::CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc,
+                                                                  int64_t dim_index, int64_t &output_mem_size,
+                                                                  int64_t &batch_dim_num, int64_t &out_size) {
+  graphStatus graph_status = ge::TensorUtils::GetSize(*output_desc, out_size);
+  if (graph_status != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "Opdesc GetSize failed!");
+    return FAILED;
+  }
+
+  GeShape output_shape = output_desc->GetShape();
+  std::vector<int64_t> output_dims = output_shape.GetDims();
+  if (dim_index >= static_cast<int64_t>(output_dims.size())) {
+    GELOGE(FAILED, "Invalid value(%ld) of attr _reuse_input_on_dim_index, which is out of data range [0, %zu).",
+           dim_index, output_dims.size());
+    return FAILED;
+  }
+
+  for (int64_t index = 0; index < dim_index; index++) {
+    FMK_INT64_MULCHECK(batch_dim_num, output_dims[index]);
+    batch_dim_num *= output_dims[index];
+    output_dims[index] = 1;
+  }
+
+  output_shape = GeShape(output_dims);
+  Format out_format = output_desc->GetFormat();
+  DataType data_type = output_desc->GetDataType();
+
+  graph_status = ge::TensorUtils::CalcTensorMemSize(output_shape, out_format, data_type, output_mem_size);
+  if (graph_status != GRAPH_SUCCESS) {
+    GELOGE(graph_status, "Opdesc CalcTensorMemSize failed!");
+    return FAILED;
+  }
+
+  if (output_mem_size < 0) {
+    GELOGE(FAILED, "After calculating tensor memory size, output_mem_size = %ld, out of data range [0, %ld]",
+           output_mem_size, INT64_MAX);
+    return FAILED;
+  }
+
+  return SUCCESS;
+}
+
 Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, size_t &mem_offset) {
   if (memory_offset_.empty()) {
     GELOGE(FAILED, "memory_offset_ is empty.");
@@ -103,7 +143,11 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, size_t &mem_offse
   GE_CHK_STATUS_RET(ReAssignContinuousMemory(is_loop_graph), "ReAssignContinuousMemory Failed!");

-  GE_CHK_STATUS_RET(ReAssignVirtualConcatMemory(), "ReAssignVirtualConcatMemory Failed!");
+  GE_CHK_STATUS_RET(ReAssignReuseAndNoPaddingContinuousInputMemory(),
+                    "ReAssignReuseAndNoPaddingContinuousInputMemory Failed!");
+
+  GE_CHK_STATUS_RET(ReAssignReuseAndNoPaddingContinuousOutputMemory(),
+                    "ReAssignReuseAndNoPaddingContinuousOutputMemory Failed!");

   GE_CHK_STATUS_RET(ReAssignMergeMemory(), "ReAssignMergeMemory Failed!");
@@ -137,7 +181,7 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
     return ret;
   }
-  memory_offset_[0].mem_offset_ += kMemAlignSize;
+  memory_offset_[0].mem_offset_ += MEM_ALIGN_SIZE;

   // Clean up atomic address, eg, hcom node
   vector input_indexes;
@@ -210,45 +254,65 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node)
   // conflict between the two, we can not support it.
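// What CalculateTensorRealSizeAndOutSize (added above in this file) computes, shown standalone:
// dims in front of dim_index are folded into batch_dim_num and replaced by 1, so the reduced
// shape describes one "piece" of the batched tensor. Plain element counting stands in here for
// TensorUtils::CalcTensorMemSize, which additionally accounts for format and data type; the
// helper name is illustrative, not GE API.

#include <cstdint>
#include <vector>

namespace sketch {
int64_t FoldBatchDims(std::vector<int64_t> dims, int64_t dim_index, int64_t &batch_dim_num) {
  batch_dim_num = 1;
  for (int64_t i = 0; i < dim_index; ++i) {
    batch_dim_num *= dims[i];  // overflow check (FMK_INT64_MULCHECK) elided in this sketch
    dims[i] = 1;
  }
  int64_t piece_elements = 1;
  for (int64_t d : dims) {
    piece_elements *= d;
  }
  return piece_elements;  // multiply by the element byte size to approximate output_mem_size
}
// Example: dims {8, 3, 224, 224} with dim_index 1 -> batch_dim_num = 8, piece shape {1, 3, 224, 224}.
}  // namespace sketch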
auto peer_output_size = peer_op_desc->GetOutputsSize(); if (is_peer_output_continuous && (peer_output_size != 1)) { - GELOGE(ge::PARAM_INVALID, + GELOGE(PARAM_INVALID, "Current node %s requires continuous input, while the previous node %s requires " "continuous output. There may be conflict between the two. This node is not supported now.", node->GetOpDesc()->GetName().c_str(), peer_op_desc->GetName().c_str()); - return ge::PARAM_INVALID; + return PARAM_INVALID; } bool is_peer_reference = false; // If GetBool fail, is_peer_reference is false. - (void)ge::AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference); + (void)AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference); if (is_peer_reference) { - GELOGE(ge::PARAM_INVALID, + GELOGE(PARAM_INVALID, "Current node %s requires continuous input, while the previous node %s requires " "reference. There may be conflict between the two. This node is not supported now.", node->GetOpDesc()->GetName().c_str(), peer_op_desc->GetName().c_str()); - return ge::PARAM_INVALID; + return PARAM_INVALID; } vector output_list = peer_op_desc->GetOutputOffset(); if (peer_out_data_anchor->GetIdx() < static_cast(output_list.size())) { output_list.at(peer_out_data_anchor->GetIdx()) = memory_offset_[0].mem_offset_; } else { - GELOGE(ge::FAILED, "index : %d is out of range.", peer_out_data_anchor->GetIdx()); - return ge::FAILED; + GELOGE(FAILED, "index : %d is out of range.", peer_out_data_anchor->GetIdx()); + return FAILED; } peer_op_desc->SetOutputOffset(output_list); - uint32_t tensor_desc_size = 0; - if (ge::TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())), tensor_desc_size) != - ge::SUCCESS) { - GELOGE(FAILED, "GetSize failed."); - return FAILED; + size_t pre_mem_offset = memory_offset_[0].mem_offset_; + std::vector offsets_for_l1_fusion = {}; + bool has_offset_attr = + AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_L1_FUSION, offsets_for_l1_fusion); + int64_t tensor_desc_size = 0; + if (has_offset_attr) { + if (peer_out_data_anchor->GetIdx() < static_cast(offsets_for_l1_fusion.size())) { + auto offset_for_l1_fusion = offsets_for_l1_fusion[peer_out_data_anchor->GetIdx()]; + memory_offset_[0].mem_offset_ += offset_for_l1_fusion; + } else { + GELOGE(FAILED, "l1 fusion: peer node %s index : %d is out of range.", peer_op_desc->GetName().c_str(), + peer_out_data_anchor->GetIdx()); + return FAILED; + } + } else { + if (TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())), tensor_desc_size) != + SUCCESS) { + GELOGE(FAILED, "GetSize failed."); + return FAILED; + } + memory_offset_[0].mem_offset_ += tensor_desc_size; } - memory_offset_[0].mem_offset_ += tensor_desc_size; - AlignMemOffset(kMemAlignSize); + // If set tensor_actual_size, Memory alignment is not required. 
+ int32_t is_tensor_actual_size = 0; + ge::AttrUtils::GetInt(peer_op_desc, ATTR_NAME_GET_TENSOR_ACTUAL_SIZE, is_tensor_actual_size); + if (is_tensor_actual_size == 0) { + AlignMemOffset(MEM_ALIGN_SIZE); + } } - return ge::SUCCESS; + return SUCCESS; } Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node) { @@ -265,7 +329,9 @@ Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { output_list[out_data_anchor->GetIdx()] = memory_offset_[0].mem_offset_; - uint32_t tensor_desc_size = 0; + size_t pre_mem_offset = memory_offset_[0].mem_offset_; + + int64_t tensor_desc_size = 0; if (ge::TensorUtils::GetSize(*(out_op_desc->GetOutputDescPtr(out_data_anchor->GetIdx())), tensor_desc_size) != ge::SUCCESS) { GELOGE(FAILED, "GetSize failed."); @@ -273,88 +339,170 @@ Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node } memory_offset_[0].mem_offset_ += tensor_desc_size; - AlignMemOffset(kMemAlignSize); + AlignMemOffset(MEM_ALIGN_SIZE); } out_op_desc->SetOutputOffset(output_list); - memory_offset_[0].mem_offset_ += kMemAlignSize; + memory_offset_[0].mem_offset_ += MEM_ALIGN_SIZE; return ge::SUCCESS; } -Status GraphMemoryAssigner::ReAssignVirtualConcatMemory() { +Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() { for (const auto &n : compute_graph_->GetAllNodes()) { - GE_CHECK_NOTNULL(n->GetOpDesc()); - if (n->GetOpDesc()->GetType() == CONCAT) { - int64_t is_node_virtual; - GE_IF_BOOL_EXEC( - !(ge::AttrUtils::GetInt(n->GetOpDesc(), "fusion_virtual_op", is_node_virtual)), // Need to change - continue;); - vector output_list = n->GetOpDesc()->GetOutputOffset(); + OpDescPtr op_desc = n->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + bool attr_continuous = false; + bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, attr_continuous); + GE_IF_BOOL_EXEC(!get_continuous_flag, continue); + bool attr_reuse = false; + bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse); + GE_IF_BOOL_EXEC(!get_reuse_flag, continue); + + if (attr_reuse && attr_continuous) { + vector output_list = op_desc->GetOutputOffset(); if (output_list.empty()) { GELOGE(FAILED, "Outputoffset is empty node name:%s", n->GetName().c_str()); return FAILED; } output_list.at(0) = memory_offset_[0].mem_offset_; - n->GetOpDesc()->SetOutputOffset(output_list); - GELOGD("Set Concat %s output offset to %zu.", n->GetOpDesc()->GetName().c_str(), memory_offset_[0].mem_offset_); + op_desc->SetOutputOffset(output_list); + GELOGI("Set node %s output offset to %zu.", op_desc->GetName().c_str(), memory_offset_[0].mem_offset_); + + int64_t attr_dim_index; + bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index); + if (!get_attr_dim_flag) { + GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed."); + return FAILED; + } size_t extra_memory_size = 0; for (const auto &in_data_anchor : n->GetAllInDataAnchors()) { auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor(); - if (peer_out_data_anchor != nullptr) { - for (const auto &next_in_data_anchor : peer_out_data_anchor->GetPeerInDataAnchors()) { - if (in_data_anchor->GetOwnerNode()->GetName() == next_in_data_anchor->GetOwnerNode()->GetName()) { - auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc(); - GE_CHECK_NOTNULL(peer_op_desc); - vector output_offsets = 
peer_op_desc->GetOutputOffset(); - if (peer_out_data_anchor->GetIdx() < static_cast(output_offsets.size())) { - output_offsets.at(peer_out_data_anchor->GetIdx()) = memory_offset_[0].mem_offset_; - } else { - GELOGE(ge::FAILED, "index : %d is out of range.", peer_out_data_anchor->GetIdx()); - return ge::FAILED; - } - peer_op_desc->SetOutputOffset(output_offsets); - ge::ConstGeTensorDescPtr output_desc = peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx()); - GE_CHECK_NOTNULL(output_desc); - int64_t output_mem_size = 0; - - // calculate tensor real size - GeShape output_shape = output_desc->GetShape(); - Format format = output_desc->GetFormat(); - DataType data_type = output_desc->GetDataType(); - graphStatus graph_status = - TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size); - if (graph_status != GRAPH_SUCCESS) { - GELOGE(graph_status, "CalcTensorMemSize failed!"); - return FAILED; - } - - if ((output_mem_size > UINT32_MAX) || (output_mem_size < 0)) { - GELOGE(FAILED, - "After calc virtual concat tensor memory size, output_mem_size = %ld, " - "out of data range [0, %u]", - output_mem_size, UINT32_MAX); - return FAILED; - } - - uint32_t size = static_cast(output_mem_size); - memory_offset_[0].mem_offset_ += size; - uint32_t out_size = 0; - if (ge::TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())), - out_size) != ge::SUCCESS) { - GELOGE(FAILED, "GetSize failed."); - return FAILED; - } - extra_memory_size = extra_memory_size + out_size - size; - } - } + GE_CHECK_NOTNULL(peer_out_data_anchor); + auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc(); + GE_CHECK_NOTNULL(peer_op_desc); + vector output_offsets = peer_op_desc->GetOutputOffset(); + if (peer_out_data_anchor->GetIdx() >= static_cast(output_offsets.size())) { + GELOGE(ge::FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx()); + return ge::FAILED; + } + output_offsets.at(peer_out_data_anchor->GetIdx()) = memory_offset_[0].mem_offset_; + peer_op_desc->SetOutputOffset(output_offsets); + size_t pre_mem_offset = memory_offset_[0].mem_offset_; + + // calculate tensor real size of each piece of data and out size of complete data + ge::ConstGeTensorDescPtr output_desc = peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx()); + GE_CHECK_NOTNULL(output_desc); + int64_t output_mem_size; + int64_t batch_dim_num = 1; + int64_t out_size; + if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) != + SUCCESS) { + GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].", + peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx()); + return FAILED; } + + memory_offset_[0].mem_offset_ += output_mem_size; + extra_memory_size = extra_memory_size + out_size - output_mem_size; + + GELOGI( + "[IMAS]Virtual node optimize : set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%ld] " + "real_size[%ld].", + n->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx(), + pre_mem_offset, peer_op_desc->GetStreamId(), out_size, output_mem_size); } memory_offset_[0].mem_offset_ += extra_memory_size; + GELOGI("After reassign virtual input node[name:%s, type:%s] memory, memory offset = %zu.", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), memory_offset_[0].mem_offset_); } } + return SUCCESS; +} + +Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousOutputMemory() { + for (const auto &n : 
compute_graph_->GetAllNodes()) { + OpDescPtr op_desc = n->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + bool attr_continuous = false; + bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT, attr_continuous); + GE_IF_BOOL_EXEC(!get_continuous_flag, continue); + bool attr_reuse = false; + bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse); + GE_IF_BOOL_EXEC(!get_reuse_flag, continue); + + if (attr_reuse && attr_continuous) { + auto in_data_anchor_list = n->GetAllInDataAnchors(); + if (in_data_anchor_list.size() != 1) { + // When current node has several inputs, can't directly determine which input is the tensor for reuse. + GELOGE(FAILED, "Only one input is supported, current node %s has %zu inputs.", n->GetName().c_str(), + in_data_anchor_list.size()); + return FAILED; + } + + // 1. set memory of to be reused input tensor + auto peer_out_data_anchor = in_data_anchor_list.at(0)->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out_data_anchor); + auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc(); + GE_CHECK_NOTNULL(peer_op_desc); + vector in_node_output_offsets = peer_op_desc->GetOutputOffset(); + if (peer_out_data_anchor->GetIdx() >= static_cast(in_node_output_offsets.size())) { + GELOGE(FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx()); + return FAILED; + } + in_node_output_offsets.at(peer_out_data_anchor->GetIdx()) = memory_offset_[0].mem_offset_; + peer_op_desc->SetOutputOffset(in_node_output_offsets); + GELOGI("Set node %s input data offset to %zu.", op_desc->GetName().c_str(), memory_offset_[0].mem_offset_); + + // 2. set memory of output tensor + vector output_list = op_desc->GetOutputOffset(); + if (output_list.empty()) { + GELOGE(FAILED, "Outputoffset is empty, node name: %s", n->GetName().c_str()); + return FAILED; + } + if (op_desc->GetOutputsSize() > output_list.size()) { + GELOGE(FAILED, "The size %zu of op_desc is more than output_list's size %zu.", op_desc->GetOutputsSize(), + output_list.size()); + return FAILED; + } + int64_t attr_dim_index; + bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index); + if (!get_attr_dim_flag) { + GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed."); + return FAILED; + } + + size_t extra_memory_size = 0; + for (auto &out_data_anchor : n->GetAllOutDataAnchors()) { + output_list[out_data_anchor->GetIdx()] = memory_offset_[0].mem_offset_; + size_t pre_mem_offset = memory_offset_[0].mem_offset_; + + // calculate tensor real size of each piece of data and out size of complete data + ge::ConstGeTensorDescPtr output_desc = op_desc->GetOutputDescPtr(out_data_anchor->GetIdx()); + GE_CHECK_NOTNULL(output_desc); + int64_t output_mem_size; + int64_t batch_dim_num = 1; + int64_t out_size; + if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) != + SUCCESS) { + GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].", + op_desc->GetName().c_str(), out_data_anchor->GetIdx()); + return FAILED; + } - GELOGI("After reassign virtual concat memory, memoffset = %zu.", memory_offset_[0].mem_offset_); + memory_offset_[0].mem_offset_ += output_mem_size; + extra_memory_size = extra_memory_size + out_size - output_mem_size; + + GELOGI("[IMAS]Virtual node optimize : set %s name[%s] output[%d] offset to [%zu], size[%ld], real_size[%ld].", + n->GetOwnerComputeGraph()->GetName().c_str(), 
op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
+              pre_mem_offset, out_size, output_mem_size);
+      }
+      op_desc->SetOutputOffset(output_list);
+      memory_offset_[0].mem_offset_ += extra_memory_size;
+      GELOGI("After reassign virtual output node[name:%s, type:%s] memory, memory offset = %zu.",
+             op_desc->GetName().c_str(), op_desc->GetType().c_str(), memory_offset_[0].mem_offset_);
+    }
+  }
   return SUCCESS;
 }
@@ -510,7 +658,118 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {
   return SUCCESS;
 }

+Status GraphMemoryAssigner::AssignSubgraphInputsMemory() {
+  GE_CHECK_NOTNULL(compute_graph_);
+  for (ComputeGraphPtr &graph : compute_graph_->GetAllSubgraphs()) {
+    GE_CHECK_NOTNULL(graph);
+    const NodePtr &parent_node = graph->GetParentNode();
+    GE_CHECK_NOTNULL(parent_node);
+    const OpDescPtr &parent_desc = parent_node->GetOpDesc();
+    GE_CHECK_NOTNULL(parent_desc);
+
+    const vector<int64_t> input_offsets = parent_desc->GetInputOffset();
+    GELOGI("SubGraph: %s graph input size: %u, parent input size: %zu, parent input offset: %zu.",
+           graph->GetName().c_str(), graph->GetInputSize(), parent_desc->GetInputsSize(), input_offsets.size());
+    if (parent_desc->GetInputsSize() < graph->GetInputSize()) {
+      GELOGE(FAILED, "SubGraph: %s Input size: %u is greater than parent input size: %zu.", graph->GetName().c_str(),
+             graph->GetInputSize(), parent_desc->GetInputsSize());
+      return FAILED;
+    }
+
+    for (NodePtr &node : graph->GetDirectNode()) {
+      GE_CHECK_NOTNULL(node);
+      GE_CHECK_NOTNULL(node->GetOpDesc());
+      if (node->GetType() != DATA_TYPE) {
+        continue;
+      }
+
+      // Find functional node input anchor.
+      uint32_t parent_index = 0;
+      if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
+        GELOGE(FAILED, "Node: %s get attr %s failed", node->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str());
+        return FAILED;
+      }
+
+      GELOGI("SubGraph: %s Parent input index: %u.", graph->GetName().c_str(), parent_index);
+      if (parent_index >= input_offsets.size()) {
+        GELOGE(FAILED, "SubGraph: %s Parent input size: %zu, parent index: %u.", graph->GetName().c_str(),
+               input_offsets.size(), parent_index);
+        return FAILED;
+      }
+
+      // Find subgraph data input anchor.
+ OutDataAnchorPtr out_anchor = node->GetOutDataAnchor(kDataOutputIndex); + GE_CHECK_NOTNULL(out_anchor); + + for (InDataAnchorPtr &peer_anchor : out_anchor->GetPeerInDataAnchors()) { + GE_CHECK_NOTNULL(peer_anchor); + const NodePtr &peer_node = peer_anchor->GetOwnerNode(); + GE_CHECK_NOTNULL(peer_node); + + vector input_offset = peer_node->GetOpDesc()->GetInputOffset(); + if (peer_anchor->GetIdx() < 0 || input_offset.size() <= static_cast(peer_anchor->GetIdx())) { + GELOGE(FAILED, "SubGraph: %s Node: %s invalid anchor index: %d.", graph->GetName().c_str(), + peer_node->GetName().c_str(), peer_anchor->GetIdx()); + return FAILED; + } + + input_offset[peer_anchor->GetIdx()] = input_offsets[parent_index]; + peer_node->GetOpDesc()->SetInputOffset(input_offset); + } + } + } + + return SUCCESS; +} + +Status GraphMemoryAssigner::AssignSubgraphOutputsMemory() { + GE_CHECK_NOTNULL(compute_graph_); + for (ComputeGraphPtr &graph : compute_graph_->GetAllSubgraphs()) { + GE_CHECK_NOTNULL(graph); + const NodePtr &parent_node = graph->GetParentNode(); + GE_CHECK_NOTNULL(parent_node); + + const NodePtr &net_output_node = graph->FindNode(NODE_NAME_NET_OUTPUT); + GE_CHECK_NOTNULL(net_output_node); + const OpDescPtr &net_output_desc = net_output_node->GetOpDesc(); + GE_CHECK_NOTNULL(net_output_desc); + + const vector input_offsets = net_output_desc->GetInputOffset(); + for (size_t i = 0; i < input_offsets.size(); ++i) { + uint32_t parent_index = 0; + if (!AttrUtils::GetInt(net_output_desc->GetInputDesc(i), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + GELOGW("SubGraph: %s input tensor %zu attr %s not found.", graph->GetName().c_str(), i, + ATTR_NAME_PARENT_NODE_INDEX.c_str()); + continue; + } + + const OutDataAnchorPtr &out_anchor = parent_node->GetOutDataAnchor(parent_index); + GE_CHECK_NOTNULL(out_anchor); + for (InDataAnchorPtr &peer_anchor : out_anchor->GetPeerInDataAnchors()) { + GE_CHECK_NOTNULL(peer_anchor); + const NodePtr &peer_node = peer_anchor->GetOwnerNode(); + GE_CHECK_NOTNULL(peer_node); + + vector input_offset = peer_node->GetOpDesc()->GetInputOffset(); + if (peer_anchor->GetIdx() < 0 || input_offset.size() <= static_cast(peer_anchor->GetIdx())) { + GELOGE(FAILED, "SubGraph: %s Node: %s invalid anchor index: %d.", graph->GetName().c_str(), + peer_node->GetName().c_str(), peer_anchor->GetIdx()); + return FAILED; + } + + input_offset[peer_anchor->GetIdx()] = input_offsets[i]; + peer_node->GetOpDesc()->SetInputOffset(input_offset); + } + } + } + + return SUCCESS; +} + Status GraphMemoryAssigner::AssignReferenceMemory(const ge::NodePtr &node) { + GELOGI("Current node %s needs to support the reference relationship between output and input.", + node->GetName().c_str()); + auto out_op_desc = node->GetOpDesc(); GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(ge::FAILED, "out_op_desc is null."); return ge::FAILED); vector output_list = out_op_desc->GetOutputOffset(); @@ -532,6 +791,8 @@ Status GraphMemoryAssigner::AssignReferenceMemory(const ge::NodePtr &node) { auto iter = input_name_index.find(out_data_anchor_name); if (iter != input_name_index.end()) { int index = iter->second; + GELOGI("Reference memory: input anchor index = %d, input anchor name = %s, output anchor name = %s.", index, + iter->first.c_str(), out_data_anchor_name.c_str()); GE_CHECK_NOTNULL(node->GetInDataAnchor(index)); auto peer_out_anchor = node->GetInDataAnchor(index)->GetPeerOutAnchor(); GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); @@ -540,8 +801,11 @@ Status GraphMemoryAssigner::AssignReferenceMemory(const 
ge::NodePtr &node) {
     auto peer_out_op_desc = peer_out_node->GetOpDesc();
     GE_CHECK_NOTNULL(peer_out_op_desc);
     output_list[out_data_anchor->GetIdx()] = peer_out_op_desc->GetOutputOffset()[peer_out_anchor_index];
+    GELOGI("Reference output : Set %s name[%s] output[%d] offset to [%ld] stream_id[%ld]",
+           node->GetOwnerComputeGraph()->GetName().c_str(), peer_out_op_desc->GetName().c_str(),
+           out_data_anchor->GetIdx(), output_list[out_data_anchor->GetIdx()], peer_out_op_desc->GetStreamId());
   } else {
-    GELOGD("Reference output : origin %s name[%s] output[%d] offset is [%ld] stream_id[%ld]",
+    GELOGI("Reference output : origin %s name[%s] output[%d] offset is [%ld] stream_id[%ld]",
            node->GetOwnerComputeGraph()->GetName().c_str(), out_op_desc->GetName().c_str(),
            out_data_anchor->GetIdx(), output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId());
   }
@@ -620,14 +884,14 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node) {

     // If you have already assigned an atomic address, skip it, and you don't need to reassign it.
     if (is_assigned_mem) {
-      GELOGD(
+      GELOGI(
         "[IMAS]Atomic output : we have assigned atomic memory as the input of next node in "
         "ReAssignContinuousMemory function.");
       continue;
     }

     auto output_desc = op_desc->GetAllOutputsDescPtr().at(output_index);
-    uint32_t size = 0;
+    int64_t size = 0;
     if (ge::TensorUtils::GetSize(*output_desc, size) != SUCCESS) {
       GELOGI("Get size failed");
     }

     output_list[output_index] = memory_offset_[0].mem_offset_;
     memory_offset_[0].mem_offset_ += size;
-    AlignMemOffset(kMemAlignSize);
+    AlignMemOffset(MEM_ALIGN_SIZE);
   }

   op_desc->SetOutputOffset(output_list);
@@ -751,6 +1015,57 @@ ge::Status GraphMemoryAssigner::SetInputOffset() {
   return ge::SUCCESS;
 }

+ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector<int64_t> &input_list) const {
+  vector<int64_t> origin_input_list;
+  vector<int64_t> memory_type;
+  auto tmp_op_desc = node->GetOpDesc();
+  origin_input_list = tmp_op_desc->GetInputOffset();
+  bool has_mem_type_attr = ge::AttrUtils::GetListInt(tmp_op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, memory_type);
+  for (const auto &anchor : node->GetAllInDataAnchors()) {
+    vector<int64_t> output_list;
+    auto peer_out_anchor = anchor->GetPeerOutAnchor();
+    if (peer_out_anchor == nullptr) {
+      continue;
+    }
+
+    // If the current node is not broadcast, the OutputOffset of the previous node is used to update the input_list
+    auto last_peer_out_node = peer_out_anchor->GetOwnerNode();
+    auto last_peer_out_op_desc = last_peer_out_node->GetOpDesc();
+    GE_CHECK_NOTNULL(last_peer_out_op_desc);
+    output_list = last_peer_out_op_desc->GetOutputOffset();
+    if (output_list.size() > static_cast<size_t>(peer_out_anchor->GetIdx())) {
+      auto input_index = anchor->GetIdx();
+      if (has_mem_type_attr) {
+        auto input_size = tmp_op_desc->GetInputsSize();
+        auto ori_input_offset_list_size = origin_input_list.size();
+        auto mem_type_size = memory_type.size();
+        if ((input_size != mem_type_size) || (input_size != ori_input_offset_list_size)) {
+          GELOGE(ge::FAILED,
+                 "L1fusion: input_size[%zu] diff from memory_type_size[%zu]"
+                 " from ori_input_offset_list_size[%lu]",
+                 input_size, mem_type_size, ori_input_offset_list_size);
+          return ge::FAILED;
+        }
+        // L1 memory keeps the original input offset;
+        // HBM input offset = original input offset + output offset
+        input_list.emplace_back(memory_type[input_index] != RT_MEMORY_HBM
+                                  ?
origin_input_list[input_index]
+                                  : origin_input_list[input_index] + output_list.at(peer_out_anchor->GetIdx()));
+        GELOGI("L1 fusion: node[%s] input[%d] is set from node[%s] out index[%d] offset[%ld]",
+               tmp_op_desc->GetName().c_str(), input_index,
+               peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(), peer_out_anchor->GetIdx(),
+               input_list.back());
+      } else {
+        GELOGI("node[%s] input[%d] is set from node[%s] out index[%d] offset[%ld]", tmp_op_desc->GetName().c_str(),
+               input_index, peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(), peer_out_anchor->GetIdx(),
+               output_list.at(peer_out_anchor->GetIdx()));
+        input_list.emplace_back(output_list.at(peer_out_anchor->GetIdx()));
+      }
+    }
+  }
+  return ge::SUCCESS;
+}
+
 ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node) const {
   vector<int64_t> input_list;
   if (node->GetType() == HCOMBROADCAST) {
@@ -783,22 +1098,8 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node) const {
       }
     }
   } else {
-    for (const auto &anchor : node->GetAllInDataAnchors()) {
-      vector<int64_t> output_list;
-      auto peer_out_anchor = anchor->GetPeerOutAnchor();
-      if (peer_out_anchor == nullptr) {
-        continue;
-      }
-
-      // If the current node not broadcast, the OutputOffset of the previous node is used to update the input_list
-      auto last_peer_out_node = peer_out_anchor->GetOwnerNode();
-      auto last_peer_out_op_desc = last_peer_out_node->GetOpDesc();
-      GE_CHECK_NOTNULL(last_peer_out_op_desc);
-      output_list = last_peer_out_op_desc->GetOutputOffset();
-      if (output_list.size() > static_cast<size_t>(peer_out_anchor->GetIdx())) {
-        input_list.emplace_back(output_list.at(peer_out_anchor->GetIdx()));
-      }
-    }
+    GE_CHK_STATUS_EXEC(UpdateOpInputOffset(node, input_list), GELOGE(FAILED, "UpdateOpInputOffset fail.");
+                       return ge::FAILED);
   }
   GE_CHECK_NOTNULL(node->GetOpDesc());
   node->GetOpDesc()->SetInputOffset(input_list);
diff --git a/src/ge/graph/build/memory/graph_mem_assigner.h b/src/ge/graph/build/memory/graph_mem_assigner.h
index 0c26c070..e46d4f8b 100644
--- a/src/ge/graph/build/memory/graph_mem_assigner.h
+++ b/src/ge/graph/build/memory/graph_mem_assigner.h
@@ -93,6 +93,10 @@ class GraphMemoryAssigner {
   ///
   ge::Status AssignVarAttr2Nodes();

+  ge::Status AssignSubgraphInputsMemory();
+
+  ge::Status AssignSubgraphOutputsMemory();
+
   ge::Status ReAssignMemory(bool is_loop_graph, size_t &mem_offset);

   ge::Status SetInputOffset();
@@ -109,7 +113,12 @@ class GraphMemoryAssigner {
   ///
   ge::Status ReAssignContinuousMemory(bool is_loop_graph);

-  ge::Status ReAssignVirtualConcatMemory();
+  ge::Status ReAssignReuseAndNoPaddingContinuousInputMemory();
+
+  ge::Status ReAssignReuseAndNoPaddingContinuousOutputMemory();
+
+  ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc, int64_t dim_index,
+                                               int64_t &output_mem_size, int64_t &batch_dim_num, int64_t &out_size);

   ge::Status ReAssignMergeMemory();
@@ -147,6 +156,8 @@ class GraphMemoryAssigner {

   void AlignMemOffset(const int64_t &mem_align_size);

+  ge::Status UpdateOpInputOffset(const NodePtr &node, vector<int64_t> &input_list) const;
+
   MemoryOffsetList memory_offset_;
   ge::ComputeGraphPtr compute_graph_;
 };
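// The input-offset rule implemented by the new UpdateOpInputOffset overload, reduced to one
// expression. For HBM-typed inputs the final offset is the op's original input offset plus the
// producer's output offset; L1-typed inputs keep the original offset untouched. Names are
// illustrative and the HBM constant is passed in, following the same assumption as the earlier
// sketches.

#include <cstdint>

namespace sketch {
int64_t ResolveInputOffset(int64_t origin_input_offset, int64_t producer_output_offset,
                           int64_t mem_type, int64_t rt_memory_hbm) {
  return (mem_type != rt_memory_hbm)
           ? origin_input_offset                            // L1: keep as-is
           : origin_input_offset + producer_output_offset;  // HBM: rebase onto producer output
}
}  // namespace sketch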
diff --git a/src/ge/graph/build/memory/memory_assigner.cc b/src/ge/graph/build/memory/memory_assigner.cc
index 92858b49..75ab01b4 100644
--- a/src/ge/graph/build/memory/memory_assigner.cc
+++ b/src/ge/graph/build/memory/memory_assigner.cc
@@ -21,36 +21,41 @@ namespace ge {
 Status MemoryAssigner::AssignMemory(bool is_loop_graph, size_t &mem_offset) {
-  std::unique_ptr<ge::GraphMemoryAssigner> graph_mem_assigner(new (std::nothrow)
-                                                                ge::GraphMemoryAssigner(compute_graph_));
-  if (graph_mem_assigner == nullptr) {
-    GELOGE(ge::FAILED, "Alloc GraphMemoryAssigner failed.");
-    return ge::FAILED;
-  }
+  GraphMemoryAssigner graph_mem_assigner(compute_graph_);

-  if (graph_mem_assigner->AssignMemory() != ge::SUCCESS) {
+  if (graph_mem_assigner.AssignMemory() != ge::SUCCESS) {
     GELOGE(ge::FAILED, "Memory assigner failed");
     return ge::FAILED;
   }

   // Reassign memory for special nodes
-  if (graph_mem_assigner->ReAssignMemory(is_loop_graph, mem_offset) != ge::SUCCESS) {
+  if (graph_mem_assigner.ReAssignMemory(is_loop_graph, mem_offset) != ge::SUCCESS) {
     GELOGE(ge::FAILED, "Memory assigner failed");
     return ge::FAILED;
   }

   // Must do variable attr assign after all the memory assigned
-  if (graph_mem_assigner->AssignVarAttr2Nodes() != SUCCESS) {
+  if (graph_mem_assigner.AssignVarAttr2Nodes() != SUCCESS) {
     GELOGE(FAILED, "Variable Memory assigner failed");
     return FAILED;
   }

-  if (graph_mem_assigner->SetInputOffset() != ge::SUCCESS) {
+  if (graph_mem_assigner.SetInputOffset() != ge::SUCCESS) {
     GELOGE(ge::FAILED, "SetInputOffset Fail!");
     return ge::FAILED;
   }

-  if (graph_mem_assigner->CheckOffset() != SUCCESS) {
+  if (graph_mem_assigner.AssignSubgraphInputsMemory() != SUCCESS) {
+    GELOGE(FAILED, "Assign subgraph inputs memory failed");
+    return FAILED;
+  }
+
+  if (graph_mem_assigner.AssignSubgraphOutputsMemory() != SUCCESS) {
+    GELOGE(FAILED, "Assign subgraph outputs memory failed");
+    return FAILED;
+  }
+
+  if (graph_mem_assigner.CheckOffset() != SUCCESS) {
     GELOGE(FAILED, "CheckOffset Fail!");
     return FAILED;
   }
diff --git a/src/ge/graph/build/memory/var_mem_assign_util.cc b/src/ge/graph/build/memory/var_mem_assign_util.cc
index 25d575c3..0a2061f8 100644
--- a/src/ge/graph/build/memory/var_mem_assign_util.cc
+++ b/src/ge/graph/build/memory/var_mem_assign_util.cc
@@ -128,6 +128,7 @@ Status VarMemAssignUtil::SetOutVariableAttr(const ge::NodePtr &node, const ge::N
   GE_CHK_BOOL_RET_STATUS(index < out_list_size, FAILED, "index %d >= output_list.size() %d", index, out_list_size);

   output_list[index] = static_cast<int64_t>(reinterpret_cast<uintptr_t>(dev_ptr));
+  GELOGI("Assign node outputOffset[index] is: %ld", output_list[index]);
   node->GetOpDesc()->SetOutputOffset(output_list);

   return SUCCESS;
@@ -171,7 +172,7 @@ Status VarMemAssignUtil::DealBroadCastNode(uint32_t graph_id, const ge::NodePtr
                          input_tensor_desc_ptr_vistor.size(), broad_cast_info.idx);
   const ge::GeTensorDescPtr input_tensor_desc =
     input_tensor_desc_ptr_vistor.at(static_cast<size_t>(broad_cast_info.idx));
-  uint32_t input_size = 0;
+  int64_t input_size = 0;
   GE_CHK_STATUS(TensorUtils::GetSize(*input_tensor_desc, input_size), "get input size failed.");
   broad_cast_info.input_size = input_size;
@@ -190,7 +191,7 @@ Status VarMemAssignUtil::DealBroadCastNode(uint32_t graph_id, const ge::NodePtr
                          output_tensor_desc_ptr_vistor.size(), broad_cast_info.idx);
   const ge::GeTensorDescPtr output_tensor_desc =
     output_tensor_desc_ptr_vistor.at(static_cast<size_t>(broad_cast_info.idx));
-  uint32_t output_size = 0;
+  int64_t output_size = 0;
   GE_CHK_STATUS(TensorUtils::GetSize(*output_tensor_desc, output_size), "get input size failed.");
   broad_cast_info.output_size = output_size;
   GE_CHK_BOOL_RET_STATUS(broad_cast_info.output_size == broad_cast_info.input_size, FAILED,
diff --git a/src/ge/graph/build/model_builder.cc b/src/ge/graph/build/model_builder.cc
index fd4ffe8b..af641dcc 100644
--- a/src/ge/graph/build/model_builder.cc
+++ b/src/ge/graph/build/model_builder.cc
@@ -17,11 +17,13 @@
 #include
"graph/build/model_builder.h" #include #include +#include #include "common/ge/ge_util.h" #include "framework/common/debug/ge_log.h" #include "graph/anchor.h" #include "graph/attr_value.h" #include "graph/buffer.h" +#include "graph/build/label_allocator.h" #include "graph/build/stream_allocator.h" #include "graph/common/omg_util.h" #include "graph/debug/ge_attr_define.h" @@ -37,14 +39,12 @@ #include "graph/utils/op_desc_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" +#include "graph/ge_context.h" #include "init/gelib.h" #include "memory/memory_assigner.h" #include "omg/version.h" #include "register/op_registry.h" -using domi::AippOpParams; -using domi::DOMI_TENSOR_NC1HWC0; -using domi::ModelTaskDef; using ge::FAILED; using ge::PARAM_INVALID; using ge::SUCCESS; @@ -65,6 +65,10 @@ const int kInvalidIndexNum = -1; const uint32_t kInputDimensions2D = 2; const uint32_t kInputDimensions3D = 3; +const char *const kVectorCore = "VectorCore"; +const char *const kCoreType = "ge.engineType"; +const std::string kEnableL1Fusion = "ge.l1Fusion"; + const set adjust_layer_type_ = {ge::CONVOLUTION}; bool IsGeLocalOp(const ge::ConstOpDescPtr &op_desc) { @@ -90,12 +94,14 @@ ModelBuilder::ModelBuilder(ge::ComputeGraphPtr compute_graph, const vector(desc_temp.GetShape().GetDimNum()); GE_IF_BOOL_EXEC(dim_num > DIM_DEFAULT_SIZE, TensorUtils::SetRealDimCnt(desc_temp, dim_num)); // calculate tensor size - uint32_t size_temp = 0; + int64_t size_temp = 0; graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(desc_temp, size_temp); if (graph_status != GRAPH_SUCCESS) { GELOGE(graph_status, "GetTensorMemorySizeInBytes failed!"); @@ -122,7 +128,7 @@ Status ModelBuilder::CalcOutputSize(const ge::NodePtr &n) { return FAILED; } - GELOGD("update output desc, dim_size: %u, mem_size: %u, format: %s, type: %s, node name:%s", dim_num, size_temp, + GELOGD("update output desc, dim_size: %u, mem_size: %ld, format: %s, type: %s, node name:%s", dim_num, size_temp, TypeUtils::FormatToSerialString(desc_temp.GetFormat()).c_str(), TypeUtils::DataTypeToSerialString(desc_temp.GetDataType()).c_str(), node_op_desc->GetName().c_str()); index++; @@ -234,26 +240,13 @@ Status ModelBuilder::SetInputOutputDesc() { ret = AdjustConstWeightSize(n, weight_offset_); GE_CHK_STATUS_RET(ret, "AdjustConstWeightSize failed"); - GE_IF_BOOL_EXEC(((weight_offset_ > 0) && (weight_offset_ % kMemAlignSize != 0)), - weight_offset_ = (weight_offset_ + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize); + GE_IF_BOOL_EXEC(((weight_offset_ > 0) && (weight_offset_ % MEM_ALIGN_SIZE != 0)), + weight_offset_ = (weight_offset_ + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE); } GE_CHK_STATUS_RET(compute_graph_->TopologicalSorting(), "TopologicalSorting failed"); return SUCCESS; } -Status ModelBuilder::AssignMemory() { - std::unique_ptr mem_assigner(new (std::nothrow) ge::MemoryAssigner(compute_graph_)); - if (mem_assigner == nullptr) { - GELOGE(FAILED, "new memory allocator failed."); - return FAILED; - } - if (mem_assigner->AssignMemory(is_loop_graph_, mem_offset_) != SUCCESS) { - GELOGE(FAILED, "memory allocator failed."); - return FAILED; - } - return SUCCESS; -} - void ModelBuilder::AddNodeInputProperty() { for (const ge::NodePtr &node : compute_graph_->GetDirectNode()) { auto node_op_desc = node->GetOpDesc(); @@ -262,7 +255,7 @@ void ModelBuilder::AddNodeInputProperty() { vector src_index_list; for (const auto &in_data_anchor : node->GetAllInDataAnchors()) { auto peer_out_anchor = 
in_data_anchor->GetPeerOutAnchor(); - GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); + GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, GELOGW("peer_out_anchor is nullptr!"); continue); GE_IF_BOOL_EXEC(node_op_desc->HasAttr(MERGE_PRENODE_FLAG), continue); ge::NodePtr src_node = peer_out_anchor->GetOwnerNode(); @@ -341,6 +334,16 @@ Status ModelBuilder::AdjustInputTensorFlag() { } return SUCCESS; } +void ModelBuilder::InitL1FusionOption() { + string is_l1_fusion_enable = "false"; + graphStatus ret = ge::GetContext().GetOption(kEnableL1Fusion, is_l1_fusion_enable); + if (ret == GRAPH_SUCCESS) { + is_l1_fusion_enable_ = is_l1_fusion_enable == "true"; + GELOGD("The value of %s is %s.", kEnableL1Fusion.c_str(), is_l1_fusion_enable.c_str()); + } else { + GELOGW("The value of %s is empty.", kEnableL1Fusion.c_str()); + } +} Status ModelBuilder::BuildModelDef(ge::Model &model) { ClearOriginalFormat(); @@ -358,6 +361,23 @@ Status ModelBuilder::BuildModelDef(ge::Model &model) { GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_EVENT_NUM, event_num_), GELOGE(FAILED, "SetInt of ATTR_MODEL_EVENT_NUM failed."); return FAILED); + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_LABEL_NUM, label_num_), + GELOGE(FAILED, "SetInt of ATTR_MODEL_LABEL_NUM failed."); + return FAILED); + string ge_core_type; + Status ret = ge::GetContext().GetOption(kCoreType, ge_core_type); + if (ret != SUCCESS) { + GELOGW("get the option CORE_TYPE fail, set it to default value VECTOR_ENGINE"); + } + int64_t core_type = (ge_core_type == kVectorCore) ? 1 : 0; + GELOGI("core_type: %ld", core_type); + if (!ge::AttrUtils::SetInt(&model, ATTR_MODEL_CORE_TYPE, core_type)) { + GELOGE(FAILED, "SetInt of ATTR_CORE_TYPE failed."); + } + InitL1FusionOption(); + GE_CHK_BOOL_EXEC(ge::AttrUtils::SetBool(&model, ATTR_NAME_SWITCH_FOR_L1_FUSION, is_l1_fusion_enable_), + GELOGE(FAILED, "SetBool of ATTR_NAME_SWITCH_FOR_L1_FUSION failed."); + return FAILED); model.SetName(compute_graph_->GetName()); model.SetGraph(ge::GraphUtils::CreateGraphFromComputeGraph(compute_graph_)); @@ -485,7 +505,7 @@ Status ModelBuilder::SaveDataToModel(ge::Model &model, ge::GeModel &ge_model) { return INTERNAL_ERROR; } int byte_size = static_cast(task_def_bytes.GetSize()); - std::shared_ptr task = ge::MakeShared(); + std::shared_ptr task = ge::MakeShared(); GE_CHECK_NOTNULL(task); GE_CHK_BOOL_EXEC(ReadProtoFromArray(task_def_bytes.GetData(), byte_size, task.get()), return INTERNAL_ERROR, "ReadProtoFromArray failed."); @@ -533,19 +553,39 @@ Status ModelBuilder::BuildModelForGetTask(ge::Model &model) { // Assign logical streams. StreamAllocator stream_allocator(compute_graph_, subgraphs_); + GE_TIMESTAMP_START(AssignLogicalStreams); GE_CHK_STATUS_RET(stream_allocator.AssignLogicalStreams(stream_max_parallel_num_, hcom_parallel_), "Assign logical streams failed."); + GE_TIMESTAMP_END(AssignLogicalStreams, "GraphBuilder::AssignLogicalStreams"); - GE_CHK_STATUS_RET(AssignMemory(), "Assign Memory Failed!"); + // Assign functional op labels. 
+ GE_TIMESTAMP_START(AssignFunctionalLabels); + LabelAllocator label_allocator(compute_graph_); + GE_CHK_STATUS_RET(label_allocator.AssignFunctionalLabels(label_num_), "Assign label failed."); + GE_TIMESTAMP_END(AssignFunctionalLabels, "ModelBuilder::AssignFunctionalLabels"); + + GE_TIMESTAMP_START(AssignMemory); + MemoryAssigner mem_assigner(compute_graph_); + GE_CHK_STATUS_RET(mem_assigner.AssignMemory(is_loop_graph_, mem_offset_), "Assign Memory Failed!"); + GE_TIMESTAMP_END(AssignMemory, "GraphBuilder::AssignMemory"); // Compile single op in graph build stage + GE_TIMESTAMP_START(CompileSingleOp); GE_CHK_STATUS_RET(CompileSingleOp(), "ATC builder CompileSingleOp() return fail."); + GE_TIMESTAMP_END(CompileSingleOp, "GraphBuilder::CompileSingleOp"); // Refresh real streams and insert event nodes. + GE_TIMESTAMP_START(RefreshRealStream); GE_CHK_STATUS_RET(stream_allocator.RefreshRealStream(stream_num_, event_num_), "RefreshRealStream failed."); + GE_TIMESTAMP_END(RefreshRealStream, "GraphBuilder::RefreshRealStream"); + GE_TIMESTAMP_START(MergeWeights); GE_CHK_STATUS_RET(MergeWeights(), "MergeWeights Failed!"); + GE_TIMESTAMP_END(MergeWeights, "GraphBuilder::MergeWeights"); + + GE_TIMESTAMP_START(BuildModelDef); GE_CHK_STATUS_RET(BuildModelDef(model), "BuildModelDef failed!"); + GE_TIMESTAMP_END(BuildModelDef, "GraphBuilder::BuildModelDef"); SetModelVersion(model); @@ -562,7 +602,6 @@ Status ModelBuilder::CompileSingleOp() { return ge::GE_CLI_GE_NOT_INITIALIZED; } - GE_TIMESTAMP_CALLNUM_START(CheckAccuracySupported); GE_TIMESTAMP_CALLNUM_START(BatchCompileOp); std::unordered_map> node_vector_map; for (auto &node : compute_graph_->GetAllNodes()) { @@ -610,7 +649,6 @@ Status ModelBuilder::CompileSingleOp() { } } GE_TIMESTAMP_CALLNUM_END(BatchCompileOp, "GraphBuild::CompileOp"); - GE_TIMESTAMP_CALLNUM_END(CheckAccuracySupported, "GraphBuild::CheckAccuracySupported"); return ge::SUCCESS; } } // namespace ge diff --git a/src/ge/graph/build/model_builder.h b/src/ge/graph/build/model_builder.h index b6eee6aa..4bf03bdc 100644 --- a/src/ge/graph/build/model_builder.h +++ b/src/ge/graph/build/model_builder.h @@ -54,8 +54,6 @@ class ModelBuilder { ge::Buffer GetWeightBuffer() const; protected: - Status AssignMemory(); - void AddNodeInputProperty(); void ClearOriginalFormat(); @@ -77,6 +75,8 @@ class ModelBuilder { Status BuildModelDef(ge::Model &model_def); + void InitL1FusionOption(); + Status CompileSingleOp(); size_t mem_offset_; @@ -91,6 +91,8 @@ class ModelBuilder { int64_t event_num_; + uint32_t label_num_; + ge::Buffer weight_buffer_; std::map stream_max_parallel_num_; @@ -103,6 +105,7 @@ class ModelBuilder { uint8_t platform_type_; bool is_loop_graph_; + bool is_l1_fusion_enable_; }; } // namespace ge #endif // GE_GRAPH_BUILD_MODEL_BUILDER_H_ diff --git a/src/ge/graph/build/run_context.cc b/src/ge/graph/build/run_context.cc index fa13c898..e3230f5e 100644 --- a/src/ge/graph/build/run_context.cc +++ b/src/ge/graph/build/run_context.cc @@ -15,13 +15,13 @@ */ #include "graph/build/run_context.h" -#include "framework/common/debug/ge_log.h" #include "common/util.h" +#include "framework/common/debug/ge_log.h" #include "graph/debug/ge_attr_define.h" namespace ge { -RunContextUtil::~RunContextUtil() { DestroyRtModelStreamAndEvents(); } +RunContextUtil::~RunContextUtil() { DestroyRtModelResources(); } Status RunContextUtil::InitMemInfo(uint8_t *data_mem_base, uint64_t data_mem_size, uint8_t *weight_mem_base, uint64_t weight_mem_size) { @@ -40,7 +40,7 @@ Status RunContextUtil::InitMemInfo(uint8_t 
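// The label handling added around here follows the same create-many / destroy-all pattern the
// file already uses for streams and events: rtLabelCreate is called label_num times and every
// handle is kept for a later rtLabelDestroy. Below is a condensed sketch of that pattern. The
// call shapes are mirrored from the calls visible in this patch, but the rtLabel_t alias, the
// integer return type, and treating 0 as RT_ERROR_NONE are assumptions made for illustration;
// the real types live in the Ascend runtime headers.

#include <cstdint>
#include <vector>

namespace sketch {
using rtLabel_t = void *;  // assumed handle type
extern "C" int rtLabelCreate(rtLabel_t *label);   // mirrored from the patch, types assumed
extern "C" int rtLabelDestroy(rtLabel_t label);   // mirrored from the patch, types assumed

bool CreateLabels(uint32_t label_num, std::vector<rtLabel_t> &labels) {
  for (uint32_t i = 0; i < label_num; ++i) {
    rtLabel_t label = nullptr;
    if (rtLabelCreate(&label) != 0) {  // 0 assumed to mean RT_ERROR_NONE
      for (rtLabel_t l : labels) {
        (void)rtLabelDestroy(l);  // roll back whatever was created so far
      }
      labels.clear();
      return false;
    }
    labels.push_back(label);
  }
  return true;  // caller destroys all handles once the model is torn down
}
}  // namespace sketch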
*data_mem_base, uint64_t data_mem_siz return SUCCESS; } -Status RunContextUtil::CreateRtModelStreamsAndEvents(uint32_t stream_num, uint32_t event_num) { +Status RunContextUtil::CreateRtModelResources(uint32_t stream_num, uint32_t event_num, uint32_t label_num) { // Create rt model rtError_t rt_ret = rtModelCreate(&rt_model_, 0); if (rt_ret != RT_ERROR_NONE) { @@ -75,10 +75,22 @@ Status RunContextUtil::CreateRtModelStreamsAndEvents(uint32_t stream_num, uint32 } event_list_.emplace_back(event); } + + // Create rt label + for (uint32_t i = 0; i < label_num; ++i) { + rtLabel_t label = nullptr; + rt_ret = rtLabelCreate(&label); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "rtLabelCreate failed. rt_ret = %d, index = %u", static_cast(rt_ret), i); + return RT_FAILED; + } + label_list_.emplace_back(label); + } + return SUCCESS; } -void RunContextUtil::DestroyRtModelStreamAndEvents() noexcept { +void RunContextUtil::DestroyRtModelResources() noexcept { rtError_t rt_ret; for (size_t i = 0; i < stream_list_.size(); i++) { // Unbind stream to model first @@ -98,6 +110,14 @@ void RunContextUtil::DestroyRtModelStreamAndEvents() noexcept { } event_list_.clear(); + for (size_t i = 0; i < label_list_.size(); ++i) { + rt_ret = rtLabelDestroy(label_list_[i]); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("Destroy label failed. rt_ret = %d, index = %zu.", static_cast(rt_ret), i); + } + } + label_list_.clear(); + if (rt_model_ != nullptr) { rt_ret = rtModelDestroy(rt_model_); if (rt_ret != RT_ERROR_NONE) { @@ -130,16 +150,22 @@ Status RunContextUtil::CreateRunContext(Model &model, const ComputeGraphPtr &gra } GELOGI("Event_num = %u", event_num); - Status ret = CreateRtModelStreamsAndEvents(stream_num, event_num); + uint32_t label_num = 0; + if (!AttrUtils::GetInt(&model, ATTR_MODEL_LABEL_NUM, label_num)) { + GELOGE(INTERNAL_ERROR, "Get label_num attr from model failed. session_id=%lu", session_id); + return INTERNAL_ERROR; + } + GELOGI("Label_num = %u", label_num); + + Status ret = CreateRtModelResources(stream_num, event_num, label_num); if (ret != SUCCESS) { - GELOGE(ret, "CreateRtModelStreamsAndEvents failed. session_id=%lu", session_id); - DestroyRtModelStreamAndEvents(); + GELOGE(ret, "CreateRtModelResources failed. session_id=%lu", session_id); + DestroyRtModelResources(); return ret; } - run_context_ = {rt_model_, nullptr, session_id, data_mem_size_, data_mem_base_, - weight_mem_size_, weight_mem_base_, buffer, stream_list_, event_list_}; - + run_context_ = {rt_model_, nullptr, session_id, data_mem_size_, data_mem_base_, weight_mem_size_, + weight_mem_base_, buffer, stream_list_, event_list_, label_list_}; return SUCCESS; } diff --git a/src/ge/graph/build/run_context.h b/src/ge/graph/build/run_context.h index 90579c77..5b24f343 100644 --- a/src/ge/graph/build/run_context.h +++ b/src/ge/graph/build/run_context.h @@ -33,26 +33,28 @@ class RunContextUtil { virtual ~RunContextUtil(); // Init mem info. 
- Status InitMemInfo(uint8_t *data_mem_base, uint64_t data_mem_size, uint8_t *weight_mem_base, - uint64_t weight_mem_size); + ge::Status InitMemInfo(uint8_t *data_mem_base, uint64_t data_mem_size, uint8_t *weight_mem_base, + uint64_t weight_mem_size); - Status CreateRunContext(Model &model_def, const ComputeGraphPtr &graph, Buffer &buffer, const uint64_t session_id); + ge::Status CreateRunContext(Model &model_def, const ComputeGraphPtr &graph, Buffer &buffer, + const uint64_t session_id); RunContext &GetRunContext(); RunContext run_context_; private: - // Create Rt model/stream/event for task generate - Status CreateRtModelStreamsAndEvents(uint32_t stream_num, uint32_t event_num); + // Create Rt model/stream/event/label for task generate + ge::Status CreateRtModelResources(uint32_t stream_num, uint32_t event_num, uint32_t label_num); - // Destroy Rt model/stream/event - void DestroyRtModelStreamAndEvents() noexcept; + // Destroy Rt model/stream/event/label + void DestroyRtModelResources() noexcept; // Model rtModel_t rt_model_ = nullptr; std::vector stream_list_; std::vector event_list_; + std::vector label_list_; // Mem info uint8_t *data_mem_base_ = nullptr; diff --git a/src/ge/graph/build/stream_allocator.cc b/src/ge/graph/build/stream_allocator.cc index 5fbd3693..ffcc2315 100644 --- a/src/ge/graph/build/stream_allocator.cc +++ b/src/ge/graph/build/stream_allocator.cc @@ -42,8 +42,8 @@ namespace ge { Status StreamAllocator::AssignLogicalStreams(const std::map &max_parallel_num, bool hcom_parallel) { GELOGI("AssignLogicalStreams start."); GE_CHECK_NOTNULL(whole_graph_); - GraphUtils::DumpGEGraph(whole_graph_, "BeforeAssignedLogicalStreams_whole_graph"); - GraphUtils::DumpGEGraphToOnnx(*whole_graph_, "BeforeAssignedLogicalStreams_whole_graph"); + GraphUtils::DumpGEGraph(whole_graph_, "BeforeAssignedLogicalStreams"); + GraphUtils::DumpGEGraphToOnnx(*whole_graph_, "BeforeAssignedLogicalStreams"); auto gelib = GELib::GetInstance(); if (gelib == nullptr) { @@ -60,8 +60,8 @@ Status StreamAllocator::AssignLogicalStreams(const std::map &m return status; } - GraphUtils::DumpGEGraph(whole_graph_, "AfterAssignedLogicalStreams_whole_graph"); - GraphUtils::DumpGEGraphToOnnx(*whole_graph_, "AfterAssignedLogicalStreams_whole_graph"); + GraphUtils::DumpGEGraph(whole_graph_, "AfterAssignedLogicalStreams"); + GraphUtils::DumpGEGraphToOnnx(*whole_graph_, "AfterAssignedLogicalStreams"); GELOGI("AssignLogicalStreams success."); return SUCCESS; @@ -124,7 +124,7 @@ Status StreamAllocator::RefreshRealStream(int64_t &stream_num, int64_t &event_nu GraphUtils::DumpGEGraph(whole_graph_, "RefreshRealStream"); GraphUtils::DumpGEGraphToOnnx(*whole_graph_, "RefreshRealStream"); - for (const NodePtr &node : whole_graph_->GetDirectNode()) { + for (const NodePtr &node : whole_graph_->GetAllNodes()) { GE_CHECK_NOTNULL(node->GetOpDesc()); auto stream_id = node->GetOpDesc()->GetStreamId(); if (stream_id == kInvalidStream) { @@ -169,7 +169,7 @@ Status StreamAllocator::SplitStreams() { pre_node_vec[i] = nullptr; } - for (const auto &cur_node : whole_graph_->GetDirectNode()) { + for (const auto &cur_node : whole_graph_->GetAllNodes()) { GE_CHECK_NOTNULL(cur_node->GetOpDesc()); int64_t stream_id = cur_node->GetOpDesc()->GetStreamId(); if (stream_id == kInvalidStream) { @@ -225,12 +225,11 @@ Status StreamAllocator::SplitStreams() { if (last_stream_id >= 0) { stream_num_ = last_stream_id + 1; } - return UpdateActiveStreams(split_streams); } Status StreamAllocator::UpdateActiveStreams(vector> &split_streams) { - for (const auto 
&node : whole_graph_->GetDirectNode()) { + for (const auto &node : whole_graph_->GetAllNodes()) { vector active_streams; GE_CHECK_NOTNULL(node->GetOpDesc()); if (AttrUtils::GetListInt(node->GetOpDesc(), ATTR_NAME_ACTIVE_STREAM_LIST, active_streams)) { @@ -260,7 +259,7 @@ Status StreamAllocator::UpdateActiveStreams(vector> &split_streams) Status StreamAllocator::ActiveStreamsBySpecificLabels() { // > map> labeled_streams; - for (const auto &node : whole_graph_->GetDirectNode()) { + for (const auto &node : whole_graph_->GetAllNodes()) { OpDescPtr op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); string stream_label; @@ -272,7 +271,7 @@ Status StreamAllocator::ActiveStreamsBySpecificLabels() { } } - for (const auto &node : whole_graph_->GetDirectNode()) { + for (const auto &node : whole_graph_->GetAllNodes()) { GE_CHECK_NOTNULL(node->GetOpDesc()); vector activated_label_list; if (!AttrUtils::GetListStr(node->GetOpDesc(), ATTR_NAME_ACTIVE_LABEL_LIST, activated_label_list) || @@ -306,7 +305,7 @@ Status StreamAllocator::ActiveStreamsForLoop() { } } // Set the stream that needs to be activated - for (const auto &node : whole_graph_->GetDirectNode()) { + for (const auto &node : whole_graph_->GetAllNodes()) { GE_CHECK_NOTNULL(node->GetOpDesc()); bool is_loop_active = false; if (AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_IS_LOOP_ACTIVE, is_loop_active) && is_loop_active) { @@ -329,7 +328,7 @@ Status StreamAllocator::ActiveStreamsForLoop() { } Status StreamAllocator::CheckStreamActived() const { - for (const auto &node : whole_graph_->GetDirectNode()) { + for (const auto &node : whole_graph_->GetAllNodes()) { GE_CHECK_NOTNULL(node->GetOpDesc()); vector active_streams; if (AttrUtils::GetListInt(node->GetOpDesc(), ATTR_NAME_ACTIVE_STREAM_LIST, active_streams)) { @@ -347,7 +346,7 @@ Status StreamAllocator::CheckStreamActived() const { // Insert the send/recv event id to the graph Status StreamAllocator::InsertSyncEvents() { - for (const auto &cur_node : whole_graph_->GetDirectNode()) { + for (const auto &cur_node : whole_graph_->GetAllNodes()) { // Take the adjacent points, then judge whether need to insert the event for (const OutDataAnchorPtr &anchor : cur_node->GetAllOutDataAnchors()) { for (const InDataAnchorPtr &peer_in_anchor : anchor->GetPeerInDataAnchors()) { @@ -427,7 +426,7 @@ Status StreamAllocator::InsertOneEventInTwoNodes(const NodePtr &cur_node, const Status StreamAllocator::OptimizeSyncEvents() { map> stream_nodes; - for (const auto &node : whole_graph_->GetDirectNode()) { + for (const auto &node : whole_graph_->GetAllNodes()) { GE_CHECK_NOTNULL(node->GetOpDesc()); int64_t stream_id = node->GetOpDesc()->GetStreamId(); stream_nodes[stream_id].emplace_back(node); @@ -484,8 +483,8 @@ Status StreamAllocator::OptimizeBySendEvents(const map> if (send_node_to_event_id.find(send_node_ptr) != send_node_to_event_id.end()) { RmvSendEventId(send_node_ptr, event_id); RmvRecvEventId(recv_node_ptr, event_id); - GELOGI("Remove send event %u for node: %s", event_id, send_node_ptr->GetName().c_str()); - GELOGI("Remove recv event %u for node: %s", event_id, recv_node_ptr->GetName().c_str()); + GELOGI("Remove event %u between node %s and node %s", event_id, send_node_ptr->GetName().c_str(), + recv_node_ptr->GetName().c_str()); } else { send_node_to_event_id[send_node_ptr] = event_id; } @@ -603,6 +602,8 @@ Status StreamAllocator::OptimizeByStreamActivate() { if (IsRecvNodeActivatedBySendNode(send_node_ptr, recv_node_ptr)) { RmvSendEventId(send_node_ptr, event_id); RmvRecvEventId(recv_node_ptr, 
event_id); + GELOGI("Remove event %u between node %s and node %s.", event_id, send_node_ptr->GetName().c_str(), + recv_node_ptr->GetName().c_str()); } } } @@ -654,7 +655,7 @@ Status StreamAllocator::RefreshContinuousEvents() { // Insert the real send/recv node in the graph Status StreamAllocator::InsertSyncEventNodes() { - for (const auto &node : whole_graph_->GetDirectNode()) { + for (const auto &node : whole_graph_->GetAllNodes()) { // Add the node corresponding to the recv event vector recv_event_id_list; GetRecvEventIdList(node, recv_event_id_list); @@ -682,7 +683,7 @@ Status StreamAllocator::InsertSyncEventNodes() { return status; } - GELOGI("Add recv %u before node: %s", event_id, node->GetName().c_str()); + GELOGI("Insert recv event %u before node: %s", event_id, node->GetName().c_str()); } // Add the node corresponding to the send event @@ -710,7 +711,7 @@ Status StreamAllocator::InsertSyncEventNodes() { return status; } - GELOGI("Add send event %u after node: %s", event_id, node->GetName().c_str()); + GELOGI("Insert send event %u after node: %s", event_id, node->GetName().c_str()); } } @@ -813,7 +814,7 @@ NodePtr StreamAllocator::GetNodeFromRecvEventId(uint32_t recv_event_id) const { void StreamAllocator::DumpEvents() { map> after_refresh_stream_nodes; - for (const auto &node : whole_graph_->GetDirectNode()) { + for (const auto &node : whole_graph_->GetAllNodes()) { GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr, continue); int64_t stream_id = node->GetOpDesc()->GetStreamId(); after_refresh_stream_nodes[stream_id].emplace_back(node); @@ -854,7 +855,7 @@ Status StreamAllocator::AddActiveEntryStream() { // Collect streams active by StreamSwitch/StreamActive node. std::set deactive_stream; - for (ge::NodePtr &node : whole_graph_->GetDirectNode()) { + for (ge::NodePtr &node : whole_graph_->GetAllNodes()) { GE_CHECK_NOTNULL(node->GetOpDesc()); Status ret = CollectDeactiveStream(node->GetOpDesc(), deactive_stream); if (ret != SUCCESS) { diff --git a/src/ge/graph/build/optimize_stream_graph.cc b/src/ge/graph/build/stream_graph_optimizer.cc similarity index 80% rename from src/ge/graph/build/optimize_stream_graph.cc rename to src/ge/graph/build/stream_graph_optimizer.cc index f6cc5071..6e0211de 100644 --- a/src/ge/graph/build/optimize_stream_graph.cc +++ b/src/ge/graph/build/stream_graph_optimizer.cc @@ -14,15 +14,9 @@ * limitations under the License. 
*/ -#include "graph/build/optimize_stream_graph.h" - -#include -#include -#include -#include +#include "stream_graph_optimizer.h" #include "common/util.h" #include "framework/common/debug/ge_log.h" - #include "graph/utils/node_utils.h" #include "graph/utils/tensor_utils.h" #include "init/gelib.h" @@ -33,9 +27,9 @@ namespace { static const int64_t kInvalidStream = -1; } // namespace namespace ge { -OptimizeStreamGraph::~OptimizeStreamGraph() {} +StreamGraphOptimizer::~StreamGraphOptimizer() {} -void OptimizeStreamGraph::RefreshNodeId(const ComputeGraphPtr &comp_graph, vector &subgraph_infos) { +void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, vector &subgraph_infos) { size_t node_size = comp_graph->GetDirectNodesSize(); GELOGI("Refresh placeholder and end nodeId start from node num: %zu", node_size); for (const auto &sub_graph_info : subgraph_infos) { @@ -43,7 +37,7 @@ void OptimizeStreamGraph::RefreshNodeId(const ComputeGraphPtr &comp_graph, vecto if (sub_graph == nullptr) { continue; } - for (ge::NodePtr &node : sub_graph->GetAllNodes()) { + for (ge::NodePtr &node : sub_graph->GetDirectNode()) { GE_CHECK_NOTNULL_EXEC(node->GetOpDesc(), return ); if ((node->GetType() == END) || (node->GetType() == PLACEHOLDER)) { node->GetOpDesc()->SetId(static_cast(node_size)); @@ -53,12 +47,12 @@ void OptimizeStreamGraph::RefreshNodeId(const ComputeGraphPtr &comp_graph, vecto } } -bool OptimizeStreamGraph::IsSameStreamId(const ComputeGraphPtr &comp_graph) { +bool StreamGraphOptimizer::IsSameStreamId(const ComputeGraphPtr &comp_graph) { if (comp_graph == nullptr) { return false; } std::set stream_set; - for (const ge::NodePtr &cur_node : comp_graph->GetAllNodes()) { + for (const ge::NodePtr &cur_node : comp_graph->GetDirectNode()) { GE_IF_BOOL_EXEC(cur_node->GetOpDesc() == nullptr, continue); int64_t stream_id = cur_node->GetOpDesc()->GetStreamId(); if (stream_id == kInvalidStream) { @@ -69,16 +63,16 @@ bool OptimizeStreamGraph::IsSameStreamId(const ComputeGraphPtr &comp_graph) { stream_set.insert(stream_id); } if (stream_set.size() > 1) { - GELOGD("Nodes of graph: %s have different stream id, node num: %zu, different stream num: %zu.", + GELOGI("Nodes of graph: %s have different stream id, node num: %zu, different stream num: %zu.", comp_graph->GetName().c_str(), comp_graph->GetDirectNodesSize(), stream_set.size()); return false; } return true; } -Status OptimizeStreamGraph::OptimizeStreamedSubGraph(const ComputeGraphPtr &comp_graph, - vector &subgraph_infos, - struct RunContext &run_context) { +Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &comp_graph, + vector &subgraph_infos, + struct RunContext &run_context) { Status ret = SUCCESS; GELOGI("Begin to Get optimize streamed subgraph."); @@ -98,13 +92,15 @@ Status OptimizeStreamGraph::OptimizeStreamedSubGraph(const ComputeGraphPtr &comp vector graph_optimizers; if (instance->DNNEngineManagerObj().IsEngineRegistered(engine_name)) { instance->OpsKernelManagerObj().GetGraphOptimizerByEngine(engine_name, graph_optimizers); + GELOGI("Subgraph: %s start optimize streamed graph. 
engineName: %s, subgraph num: %zu, graph Optimizer num: %zu.", + sub_graph->GetName().c_str(), engine_name.c_str(), subgraph_infos.size(), graph_optimizers.size()); - auto nodes = sub_graph->GetAllNodes(); + auto nodes = sub_graph->GetDirectNode(); if (nodes.empty()) { continue; } if (!IsSameStreamId(sub_graph)) { - GELOGD("There are more than one stream in subgraph %s", sub_graph->GetName().c_str()); + GELOGI("There are more than one stream in subgraph %s", sub_graph->GetName().c_str()); continue; } OpDescPtr op_desc = nodes.at(0)->GetOpDesc(); @@ -128,7 +124,7 @@ Status OptimizeStreamGraph::OptimizeStreamedSubGraph(const ComputeGraphPtr &comp sub_graph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size(), ret); return ret; } - GELOGD( + GELOGI( "[optimizeStreamedSubGraph]: optimize streamed subgraph success, subgraph: %s, engine_name: %s, graph " "Optimizer num: %zu!", sub_graph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size()); diff --git a/src/ge/graph/build/optimize_stream_graph.h b/src/ge/graph/build/stream_graph_optimizer.h similarity index 85% rename from src/ge/graph/build/optimize_stream_graph.h rename to src/ge/graph/build/stream_graph_optimizer.h index de1ca01c..a65f95f2 100644 --- a/src/ge/graph/build/optimize_stream_graph.h +++ b/src/ge/graph/build/stream_graph_optimizer.h @@ -25,15 +25,15 @@ #include "graph/manager/graph_manager_utils.h" namespace ge { -class OptimizeStreamGraph { +class StreamGraphOptimizer { public: - OptimizeStreamGraph() = default; + StreamGraphOptimizer() = default; - OptimizeStreamGraph(const OptimizeStreamGraph &) = delete; + StreamGraphOptimizer(const StreamGraphOptimizer &) = delete; - OptimizeStreamGraph &operator=(const OptimizeStreamGraph &) = delete; + StreamGraphOptimizer &operator=(const StreamGraphOptimizer &) = delete; - virtual ~OptimizeStreamGraph(); + virtual ~StreamGraphOptimizer(); Status OptimizeStreamedSubGraph(const ComputeGraphPtr &comp_graph, std::vector &subgraph_ptr_list, struct RunContext &run_context); diff --git a/src/ge/graph/build/task_generator.cc b/src/ge/graph/build/task_generator.cc index a192c0d2..e8f6dd26 100644 --- a/src/ge/graph/build/task_generator.cc +++ b/src/ge/graph/build/task_generator.cc @@ -51,6 +51,8 @@ const uint64_t kProfilingBpEndLogid = 2; const uint64_t kProfilingArStartLogid = 3; const uint64_t kProfilingArEndLogid = 4; const uint64_t kProfilingIterEndLogid = 255; +const int64_t kMaxNodeNumInNormalStream = 350; +const int64_t kInvalidGroupId = -1; } // namespace namespace ge { TaskGenerator::TaskGenerator(uint8_t *var_mem_base, uint64_t var_mem_size) { @@ -179,6 +181,57 @@ Status TaskGenerator::UpdateOpIsVarAttr(const OpDescPtr &op_desc, uint64_t sessi return SUCCESS; } +Status TaskGenerator::SaveL1fusionNodes(map> &l1_fusion_nodes, ComputeGraphPtr &graph) { + std::map nodes_with_group_attr; + for (auto &node : graph->GetAllNodes()) { + OpDescPtr op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + int64_t group_id = kInvalidGroupId; + string name = node->GetName(); + string type = node->GetType(); + // For l1 fusion ddb pass, task def must be continuous. 
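+    // "Continuous" here means the TaskDefs of one fusion group must end up
+    // adjacent in task_def_list. A sketch of the two-phase scheme below, with
+    // hypothetical nodes A, B, C in one group on one stream:
+    //   Part1 (store): l1_fusion_nodes[group_key] == {A, B, C}
+    //   Part2 (call):  when A is first reached in GenerateTask, tasks for
+    //                  A, B and C are generated back to back, and B/C are
+    //                  marked as seen so their tasks are not generated again.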
+    // Part1: store
+    // If op_desc has this tag, store it in the map first;
+    // the stored elements are passed to GenerateTask at the end
+    if (ge::AttrUtils::GetInt(op_desc, ATTR_NAME_L1_FUSION_GROUP_ID, group_id)) {
+      auto stream_id = op_desc->GetStreamId();
+      auto group_key = group_id + stream_id * kMaxNodeNumInNormalStream;
+      (void)ge::AttrUtils::SetInt(op_desc, ATTR_NAME_L1_FUSION_GROUP_KEY, group_key);
+      GELOGI("L1Fusion: store node[name:%s(%s), group id:%ld, group key:%ld, stream_id:%ld] task.", name.c_str(),
+             type.c_str(), group_id, group_key, op_desc->GetStreamId());
+      l1_fusion_nodes[group_key].push_back(node);
+      nodes_with_group_attr.insert({node, group_id});
+    }
+
+    // If all of a node's input nodes carry the same group attr, while this
+    // node has no attr or a different group attr,
+    // that is a bad case: return an error
+    bool call_check = true;
+    std::unordered_set<int64_t> input_group_ids;
+    for (const auto &input_node : node->GetInNodes()) {
+      auto iter = nodes_with_group_attr.find(input_node);
+      if (iter == nodes_with_group_attr.end()) {
+        call_check = false;
+        break;
+      } else {
+        input_group_ids.insert(iter->second);
+      }
+    }
+    call_check = (call_check && (input_group_ids.size() == 1));
+    if (call_check) {
+      auto input_group_id = *input_group_ids.begin();
+      if (group_id != input_group_id) {
+        GELOGE(INTERNAL_ERROR,
+               "L1Fusion: node[name:%s(%s)] group id:%ld differs from its input nodes' group id:%ld.",
+               name.c_str(), type.c_str(), group_id, input_group_id);
+        return INTERNAL_ERROR;
+      }
+    }
+  }
+  GELOGI("L1Fusion: number of fusion groups: %zu.", l1_fusion_nodes.size());
+  return SUCCESS;
+}
+
 Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &graph,
                                    vector<domi::TaskDef> &task_def_list, map<uint32_t, string> &op_name_map) {
   std::shared_ptr<GELib> ge_lib = GELib::GetInstance();
@@ -186,36 +239,53 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
     GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GenerateTask failed.");
     return GE_CLI_GE_NOT_INITIALIZED;
   }
-
-  auto ret = MarkFirstAndLastNode(graph);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "MarkFirstAndLastNode failed.");
-    return ret;
-  }
-
+  GE_CHK_STATUS_RET(MarkNodeAndSetIndex(graph), "MarkNodeAndSetIndex failed.");
   ProfilingPoint ppoint;
   vector<uint32_t> ar_ppoint;
   GE_CHK_STATUS_RET(FindProfilingTaskIndex(graph, ppoint, ar_ppoint));
 
   const OpsKernelManager &ops_kernel_manager = ge_lib->OpsKernelManagerObj();
-  uint32_t node_index = 0;
   GE_TIMESTAMP_CALLNUM_START(GenerateTask);
+  // map that stores l1 fusion nodes, keyed by group key
+  map<int64_t, vector<NodePtr>> l1_fusion_nodes;
+  string is_l1_fusion_enable = "false";
+  graphStatus ret = ge::GetContext().GetOption("ge.l1Fusion", is_l1_fusion_enable);
+  if ((ret == GRAPH_SUCCESS) && (is_l1_fusion_enable == "true")) {
+    GE_CHK_STATUS_RET(SaveL1fusionNodes(l1_fusion_nodes, graph));
+  }
+  std::unordered_set<Node *> l1_fusion_nodes_seen;
+  int64_t group_id;
+  uint32_t node_index = 0;
   for (auto &node : graph->GetAllNodes()) {
-    GE_CHECK_NOTNULL(node->GetOpDesc());
-    if (node->GetOpDesc()->GetType() == CONCAT) {
-      int64_t is_node_virtual;
-      GE_IF_BOOL_EXEC(ge::AttrUtils::GetInt(node->GetOpDesc(), "fusion_virtual_op", is_node_virtual), continue);
-    }
-    node_index++;
     OpDescPtr op_desc = node->GetOpDesc();
-    GE_CHK_STATUS_RET(UpdateOpIsVarAttr(op_desc, graph->GetSessionID()));
-
+    GE_CHECK_NOTNULL(op_desc);
+    node_index++;
     string name = node->GetName();
     string type = node->GetType();
+    bool attr_notask = false;
+    bool get_attr_notask_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOTASK, attr_notask);
+    GE_IF_BOOL_EXEC(get_attr_notask_flag && attr_notask,
GELOGI("Node[name:%s, type:%s] does not need to generate task.", name.c_str(), type.c_str()); + continue); + + GE_CHK_STATUS_RET(UpdateOpIsVarAttr(op_desc, graph->GetSessionID())); string op_kernel_lib_name = op_desc->GetOpKernelLibName(); + // For l1 fusion ddb pass, task def must be continuous. + // Part2: Call + auto l1_fusion_task_info = + L1FusionTaskInfo{run_context, graph, node, op_desc, node_index, ge_lib, + ops_kernel_manager, task_def_list, op_name_map, ppoint, ar_ppoint}; + GE_CHK_STATUS_RET(GenerateTaskForL1FusionNode(l1_fusion_task_info, l1_fusion_nodes, l1_fusion_nodes_seen), + "Call GenerateTaskForL1FusionNode node:%s(%s) failed", name.c_str(), type.c_str()); + // continue directly + if (ge::AttrUtils::GetInt(op_desc, ATTR_NAME_L1_FUSION_GROUP_ID, group_id)) { + GELOGI("L1Fusion not %s to generate node[name:%s(%s) task again.", op_kernel_lib_name.c_str(), name.c_str(), + type.c_str()); + continue; + } if (op_kernel_lib_name.empty()) { - GELOGI("Node[name:%s(%s)] task no need to generate task.", name.c_str(), type.c_str()); + GELOGI("Node[name:%s, type:%s] does not need to generate task.", name.c_str(), type.c_str()); continue; } @@ -225,13 +295,8 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra type.c_str(), op_kernel_lib_name.c_str()); return INTERNAL_ERROR; } - - ret = UpdateAnchorStatus(node); - if (ret != SUCCESS) { - GELOGE(ret, "Call UpdateAnchorStatus node:%s(%s) failed", name.c_str(), type.c_str()); - return ret; - } - + GE_CHK_STATUS_RET(UpdateAnchorStatus(node), "Call UpdateAnchorStatus node:%s(%s) failed", name.c_str(), + type.c_str()); int64_t op_id = op_desc->GetId(); int64_t stream_id = op_desc->GetStreamId(); if (stream_id < 0 || stream_id >= static_cast(run_context.graphStreamList.size())) { @@ -247,7 +312,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra GELOGD("Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task.", op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id); GE_TIMESTAMP_RESTART(GenerateTask); - ret = kernel_info_store->GenerateTask(*node, run_context, task_def_list); + auto ret = kernel_info_store->GenerateTask(*node, run_context, task_def_list); GE_TIMESTAMP_ADD(GenerateTask); if (ret != SUCCESS) { GELOGE(ret, "Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task failed.", @@ -285,6 +350,113 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra return SUCCESS; } +Status TaskGenerator::GenerateTaskForL1FusionNode(L1FusionTaskInfo &fusion_task_info, + std::map> &l1_fusion_nodes, + std::unordered_set &l1_fusion_nodes_seen) { + Status ret = SUCCESS; + int64_t group_id; + auto &run_context = fusion_task_info.run_context; + auto &graph = fusion_task_info.graph; + auto &node = fusion_task_info.node; + auto &fusion_op_desc = fusion_task_info.fusion_op_desc; + auto &node_index = fusion_task_info.node_index; + const auto &ops_kernel_manager = fusion_task_info.ops_kernel_manager; + auto &task_def_list = fusion_task_info.task_def_list; + auto &op_name_map = fusion_task_info.op_name_map; + auto &ppoint = fusion_task_info.ppoint; + auto &ar_ppoint = fusion_task_info.ar_ppoint; + auto stream_id = fusion_op_desc->GetStreamId(); + // If op_desc have this attr, call nodes with same group id in a stream together + if (ge::AttrUtils::GetInt(fusion_op_desc, ATTR_NAME_L1_FUSION_GROUP_ID, group_id) && + (l1_fusion_nodes_seen.count(node.get()) == 0)) { + auto group_key = group_id + stream_id * kMaxNodeNumInNormalStream; + 
GELOGI("L1Fusion: start fusion group index[%ld], nodes size[%ld].", group_key, l1_fusion_nodes[group_key].size()); + for (auto &fusion_node : l1_fusion_nodes[group_key]) { + OpDescPtr op_desc = fusion_node->GetOpDesc(); + + UpdateOpIsVarAttr(op_desc, graph->GetSessionID()); + std::string fusion_node_name = fusion_node->GetName(); + std::string fusion_node_type = fusion_node->GetType(); + std::string op_kernel_lib_name = op_desc->GetOpKernelLibName(); + if (op_kernel_lib_name.empty()) { + GELOGI("L1Fusion: fusion_node[name:%s(%s)] task no need to generate task.", fusion_node_name.c_str(), + fusion_node_type.c_str()); + continue; + } + + size_t task_list_size_before = task_def_list.size(); + OpsKernelInfoStorePtr kernel_info_store = ops_kernel_manager.GetOpsKernelInfoStore(op_kernel_lib_name); + if (kernel_info_store == nullptr) { + GELOGE(INTERNAL_ERROR, "L1Fusion: No ops kernel store found. fusion_node:%s(%s), op_kernel_lib_name=%s.", + fusion_node_name.c_str(), fusion_node_type.c_str(), op_kernel_lib_name.c_str()); + return INTERNAL_ERROR; + } + + ret = UpdateAnchorStatus(fusion_node); + if (ret != SUCCESS) { + GELOGE(ret, "L1Fusion: Call UpdateAnchorStatus fusion_node:%s(%s) failed", fusion_node_name.c_str(), + fusion_node_type.c_str()); + return ret; + } + + int64_t op_id = op_desc->GetId(); + int64_t stream_id = op_desc->GetStreamId(); + if (stream_id < 0 || stream_id >= (int64_t)run_context.graphStreamList.size()) { + GELOGE(INTERNAL_ERROR, "L1Fusion: fusion_node[name:%s(%s), id:%ld] stream id is invalid, stream list size=%zu", + fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, run_context.graphStreamList.size()); + return INTERNAL_ERROR; + } + // profiling task + (void)InsertProfilingTaskBefore(op_desc, ppoint, ar_ppoint, node_index, task_def_list); + run_context.stream = run_context.graphStreamList[stream_id]; + GELOGI("L1Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld] task.", + op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); + ret = kernel_info_store->GenerateTask(*fusion_node, run_context, task_def_list); + if (ret != SUCCESS) { + GELOGE(ret, + "L1Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), " + "id:%ld, stream_id:%ld] task failed.", + op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id); + return ret; + } + // profiling task + (void)InsertProfilingTaskAfter(op_desc, ppoint, ar_ppoint, node_index, task_def_list); + size_t task_list_size_after = task_def_list.size(); + // if tasks is reduced + if (task_list_size_after < task_list_size_before) { + GELOGE(FAILED, + "L1Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), " + "id:%ld, stream_id:%ld] task. 
but task num from %zu to %zu.", + op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id, + task_list_size_before, task_list_size_after); + return FAILED; + } + + // reset stream id to ge stream id, as graph load must use ge stream to reassign stream + void *ops_kernel_info_store_ptr = kernel_info_store.get(); + for (size_t idx = task_list_size_before; idx < task_list_size_after; ++idx) { + task_def_list[idx].set_stream_id(static_cast(stream_id)); + op_name_map[idx] = fusion_node_name; + // set opsKernelInfoStorePtr and op_index, the two fields be use in DistributeTask and InitTaskInfo + TaskDef *task_def_ptr = &task_def_list[idx]; + task_def_ptr->set_ops_kernel_store_ptr(reinterpret_cast(ops_kernel_info_store_ptr)); + } + + GELOGI( + "L1Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld]" + " task finished, generate %u task(s).", + op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id, + task_list_size_after - task_list_size_before); + + // record nodes which have call generate task successfully + l1_fusion_nodes_seen.insert(fusion_node.get()); + node_index++; + } + } + // without tag or has been seen, skip directly + return ret; +} + Status TaskGenerator::UpdateAnchorStatus(const NodePtr &node) { if (NodeUtils::SetAllAnchorStatus(node) != GRAPH_SUCCESS) { GELOGE(INTERNAL_ERROR, "NodeUtils::SetAllAnchorStatus failed."); @@ -313,23 +485,26 @@ Status TaskGenerator::UpdateAnchorStatus(const NodePtr &node) { return SUCCESS; } -Status TaskGenerator::MarkFirstAndLastNode(ComputeGraphPtr &graph) { +Status TaskGenerator::MarkNodeAndSetIndex(ComputeGraphPtr &graph) { std::shared_ptr ge_lib = GELib::GetInstance(); if ((ge_lib == nullptr) || !ge_lib->InitFlag()) { GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized or is finalized"); return GE_CLI_GE_NOT_INITIALIZED; } + int64_t node_index = 0; map>> engine_stream_stat; for (auto &node : graph->GetAllNodes()) { - GE_CHECK_NOTNULL(node->GetOpDesc()); - string op_kernel_lib_name = node->GetOpDesc()->GetOpKernelLibName(); - int64_t stream_id = node->GetOpDesc()->GetStreamId(); + const OpDescPtr &op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + string op_kernel_lib_name = op_desc->GetOpKernelLibName(); + int64_t stream_id = op_desc->GetStreamId(); + op_desc->SetId(node_index++); if (op_kernel_lib_name.empty()) { // Reset op kernel lib - (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(node->GetOpDesc()); - op_kernel_lib_name = node->GetOpDesc()->GetOpKernelLibName(); + (void)ge_lib->DNNEngineManagerObj().GetDNNEngineName(op_desc); + op_kernel_lib_name = op_desc->GetOpKernelLibName(); if (op_kernel_lib_name.empty()) { GELOGE(INTERNAL_ERROR, "node:%s(%s) get op kernel lib failed.", node->GetName().c_str(), node->GetType().c_str()); @@ -378,11 +553,13 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi } const char *fp_point = std::getenv(kProfilingFpPoint); if (fp_point == nullptr) { + GELOGW("first forward profiling op name not set."); return SUCCESS; } string fp_point_str = string(fp_point); const char *bp_point = std::getenv(kProfilingBpPoint); if (bp_point == nullptr) { + GELOGW("last backward profiling op name not set."); return SUCCESS; } string bp_point_str = string(bp_point); @@ -422,6 +599,13 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi ppoint.fp_index = first_fp; ppoint.bp_index = last_bp; ppoint.end_index = iter_end; + bool train_graph 
= graph->GetNeedIteration(); + if (ppoint.fp_index == 0 && train_graph) { + GELOGE(FAILED, "First forward op name can't be found in graph for training trace."); + } + if (ppoint.bp_index == 0 && train_graph) { + GELOGE(FAILED, "Last backward op name can't be found in graph for training trace."); + } return SUCCESS; } diff --git a/src/ge/graph/build/task_generator.h b/src/ge/graph/build/task_generator.h index ae7f6885..7fa48ea1 100644 --- a/src/ge/graph/build/task_generator.h +++ b/src/ge/graph/build/task_generator.h @@ -30,11 +30,28 @@ #include "runtime/rt.h" namespace ge { +class GELib; +class OpsKernelManager; + struct ProfilingPoint { uint32_t fp_index = 0; uint32_t bp_index = 0; uint32_t end_index = 0; }; +// Describes infos needed by generate task for l1 fusion node +struct L1FusionTaskInfo { + RunContext &run_context; + ComputeGraphPtr &graph; + NodePtr &node; + OpDescPtr &fusion_op_desc; + uint32_t &node_index; + std::shared_ptr &ge_lib; + const OpsKernelManager &ops_kernel_manager; + std::vector &task_def_list; + std::map &op_name_map; + ProfilingPoint &ppoint; + vector ar_ppoint; +}; class TaskGenerator { public: @@ -87,7 +104,7 @@ class TaskGenerator { RunContext &run_context); // Mark first and last node according to the same stream and engine - Status MarkFirstAndLastNode(ComputeGraphPtr &graph); + Status MarkNodeAndSetIndex(ComputeGraphPtr &graph); // profiling interface Status FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &ppoint, @@ -100,6 +117,18 @@ class TaskGenerator { std::vector &task_def_list); static bool IsProfPoint(const OpDescPtr &op, const std::string &name); + /// call engine to generate task for l1 fusion node. + /// @param L1FusionTaskInfo + /// @param l1_fusion_nodes: nodes in graph with groud_id attr which means l1 fusion node + /// @param l1_fusion_nodes_seen: l1 fusion node has been called generate task + /// @return SUCCESS:seccess + /// Other: failed + /// + Status GenerateTaskForL1FusionNode(L1FusionTaskInfo &fusion_task_info, + std::map> &l1_fusion_nodes, + std::unordered_set &l1_fusion_nodes_seen); + + Status SaveL1fusionNodes(map> &l1_fusion_nodes, ComputeGraphPtr &graph); uint8_t *var_mem_base_ = nullptr; uint64_t var_mem_size_ = 0; diff --git a/src/ge/graph/common/bcast.h b/src/ge/graph/common/bcast.h index 429f153f..9df1c422 100644 --- a/src/ge/graph/common/bcast.h +++ b/src/ge/graph/common/bcast.h @@ -28,7 +28,6 @@ #include "graph/attr_value.h" #include "graph/ge_tensor.h" #include "graph/utils/tensor_adapter.h" -#include "unsupported/Eigen/CXX11/Tensor" namespace ge { static const size_t kMinDimNum = 2; @@ -106,24 +105,6 @@ class BCast { /// static kVecInt TransShapeToDimVec(const GeTensorDesc &shape); - /// - /// @ingroup domi_calibration - /// from Bcast::kVecInt to Eigen::array - /// @param [in] vec dim info - /// @return Eigen::array - /// - template - static Status ToIndexArray(const BCast::kVecInt &vec, Eigen::array &ret) { - if (vec.size() != NDIMS) { - GELOGE(domi::PARAM_INVALID, "ToIndexArray failed. 
size of vector = %zu is not equal to NDIMS = %d.", vec.size(), - NDIMS); - return domi::PARAM_INVALID; - } - for (int i = 0; i < NDIMS; ++i) { - ret[i] = vec[i]; - } - return domi::SUCCESS; - } void BCastIndexes(kVecInt &x_indexes, kVecInt &y_indexes); template Status BCastCompute(const std::vector &input, std::vector &v_output, diff --git a/src/ge/graph/common/omg_util.cc b/src/ge/graph/common/omg_util.cc index 31c3f06f..00091c10 100644 --- a/src/ge/graph/common/omg_util.cc +++ b/src/ge/graph/common/omg_util.cc @@ -18,7 +18,6 @@ #include -#include "common/op/attr_define.h" #include "framework/common/debug/ge_log.h" #include "graph/debug/ge_attr_define.h" #include "graph/utils/graph_utils.h" @@ -58,7 +57,7 @@ Status SetStreamLabel(const ge::NodePtr &node, const std::string &label) { OpDescPtr tmp_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(tmp_desc); - if (!AttrUtils::SetStr(tmp_desc, ATTR_NAME_STREAM_LABEL, label)) { + if (!AttrUtils::SetStr(tmp_desc, ge::ATTR_NAME_STREAM_LABEL, label)) { GELOGE(FAILED, "Op: %s set ATTR_NAME_STREAM_LABEL failed", node->GetName().c_str()); return FAILED; } @@ -75,7 +74,7 @@ Status SetCycleEvent(const ge::NodePtr &node) { GE_CHECK_NOTNULL(node); OpDescPtr tmp_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(tmp_desc); - if (!AttrUtils::SetBool(tmp_desc, ATTR_NAME_STREAM_CYCLE_EVENT_FLAG, true)) { + if (!AttrUtils::SetBool(tmp_desc, ge::ATTR_NAME_STREAM_CYCLE_EVENT_FLAG, true)) { GELOGE(FAILED, "Op: %s set ATTR_NAME_STREAM_CYCLE_EVENT_FLAG failed", node->GetName().c_str()); return FAILED; } diff --git a/src/ge/graph/common/transop_util.cc b/src/ge/graph/common/transop_util.cc index b9754bed..3250929d 100644 --- a/src/ge/graph/common/transop_util.cc +++ b/src/ge/graph/common/transop_util.cc @@ -24,8 +24,8 @@ const int kInvalidTransopDataIndex = -1; namespace ge { TransOpUtil::TransOpUtil() { - transop_index_map_ = {{TRANSDATA, 0}, {TRANSPOSE, 0}, {TRANSPOSED, 0}, - {RESHAPE, 0}, {REFORMAT, 0}, {CAST, 0}}; + transop_index_map_ = {{TRANSDATA, 0}, {TRANSPOSE, 0}, {TRANSPOSED, 0}, {RESHAPE, 0}, + {REFORMAT, 0}, {CAST, 0}, {SQUEEZE, 0}, {EXPANDDIMS, 0}}; } TransOpUtil::~TransOpUtil() {} diff --git a/src/ge/graph/execute/graph_execute.cc b/src/ge/graph/execute/graph_execute.cc index a1a30922..0f83a494 100644 --- a/src/ge/graph/execute/graph_execute.cc +++ b/src/ge/graph/execute/graph_execute.cc @@ -15,7 +15,10 @@ */ #include "graph/execute/graph_execute.h" + +#include #include + #include "common/ge_inner_error_codes.h" #include "common/model_parser/base.h" #include "graph/load/new_model_manager/model_manager.h" @@ -24,10 +27,6 @@ #include "runtime/mem.h" namespace ge { -namespace { -const char ENGINE_AI_CORE[] = "DNN_V100"; -} // namespace - GraphExecutor::GraphExecutor() : init_flag_(false), train_graph_flag_(false), @@ -159,38 +158,41 @@ Status GraphExecutor::PrepareInputData(const std::vector &input_tensor graph_input_data.timestamp = 0; std::size_t inputSize = input_tensor.size(); std::size_t output_size = output_desc.size(); - std::vector buffer_size_vec; - std::vector addr_vec; + std::vector bufferSizeVec; + std::vector addrVec; for (std::size_t i = 0; i < inputSize; ++i) { const GeTensor *InTensor = &input_tensor[i]; GE_CHECK_NOTNULL(InTensor); - buffer_size_vec.push_back(static_cast(InTensor->GetData().size())); + bufferSizeVec.push_back(InTensor->GetData().size()); } for (const auto &desc : output_desc) { - buffer_size_vec.push_back(desc.size); + bufferSizeVec.push_back(desc.size); } - Status ret = MallocInOutBuffer(buffer_size_vec, addr_vec); + Status ret = 
MallocInOutBuffer(bufferSizeVec, addrVec); if (ret != SUCCESS) { GELOGE(GE_GRAPH_MALLOC_FAILED, "[GraphExecutor] Malloc mem failed"); return GE_GRAPH_MALLOC_FAILED; } - for (std::size_t i = 0; i < input_tensor.size() && i < addr_vec.size(); ++i) { + for (std::size_t i = 0; i < input_tensor.size() && i < addrVec.size(); ++i) { const GeTensor *in_tensor = &input_tensor[i]; GE_CHECK_NOTNULL(in_tensor); - if ((addr_vec[i] != nullptr) && (in_tensor->GetData().data() != nullptr)) { - if (memcpy_s(addr_vec[i], buffer_size_vec[i], in_tensor->GetData().data(), in_tensor->GetData().size()) != 0) { - GELOGE(GE_GRAPH_EXECUTE_FAILED, "[GraphExecutor] memcpy input data failed."); + if ((addrVec[i] != nullptr) && (in_tensor->GetData().data() != nullptr)) { + errno_t s_ret = memcpy_s(addrVec[i], bufferSizeVec[i], in_tensor->GetData().data(), in_tensor->GetData().size()); + if (s_ret != 0) { + GELOGE(GE_GRAPH_EXECUTE_FAILED, + "[GraphExecutor] memcpy input data failed, errno: %d, dst size: %u, src size: %zu.", s_ret, + bufferSizeVec[i], in_tensor->GetData().size()); return GE_GRAPH_EXECUTE_FAILED; } } DataBuffer in_data_buf; - in_data_buf.data = reinterpret_cast(addr_vec[i]); - in_data_buf.length = static_cast(in_tensor->GetData().size()); + in_data_buf.data = reinterpret_cast(addrVec[i]); + in_data_buf.length = in_tensor->GetData().size(); in_data_buf.isDataSupportMemShare = false; graph_input_data.blobs.push_back(in_data_buf); } @@ -202,7 +204,7 @@ Status GraphExecutor::PrepareInputData(const std::vector &input_tensor uint32_t buffer_size = desc.size; DataBuffer out_data_buf; - out_data_buf.data = reinterpret_cast(addr_vec[inputSize + j]); + out_data_buf.data = reinterpret_cast(addrVec[inputSize + j]); out_data_buf.length = buffer_size; out_data_buf.isDataSupportMemShare = false; graph_output_data.blobs.push_back(out_data_buf); @@ -264,34 +266,32 @@ Status GraphExecutor::SyncExecuteModel(uint32_t model_id, const std::vector out_buf_tmp(new (std::nothrow) uint8_t[out_data_tmp.length]); - if (out_buf_tmp == nullptr) { + std::unique_ptr outBufTmp(new (std::nothrow) uint8_t[outputDataTmp.length]); + if (outBufTmp == nullptr) { GELOGE(FAILED, "Failed to allocate memory."); return FAILED; } - rtError_t ret_value = rtMemcpy(out_buf_tmp.get(), out_data_tmp.length, out_data_tmp.data, out_data_tmp.length, + GE_PRINT_DYNAMIC_MEMORY(new, "the output memory of data on training.", sizeof(uint8_t) * outputDataTmp.length) + rtError_t ret_value = rtMemcpy(outBufTmp.get(), outputDataTmp.length, outputDataTmp.data, outputDataTmp.length, RT_MEMCPY_DEVICE_TO_HOST); CHECK_FALSE_EXEC(ret_value == RT_ERROR_NONE, GELOGE(GE_GRAPH_EXECUTE_FAILED, "Call rt api rtMemcpy failed, ret: 0x%X", ret); return GE_GRAPH_EXECUTE_FAILED); - GeTensor out_tensor; - std::vector shape_dims; + GeTensor outTensor; + std::vector shapeDims; for (const auto &dim : output_desc[i].shape_info.dims) { - shape_dims.push_back(dim); + shapeDims.push_back(dim); } - GeShape out_shape(shape_dims); - out_tensor.MutableTensorDesc().SetShape(out_shape); - out_tensor.MutableTensorDesc().SetDataType((DataType)output_desc[i].data_type); - if (out_tensor.SetData(out_buf_tmp.get(), out_data_tmp.length) != SUCCESS) { - GELOGE(FAILED, "Out tensor set data failed"); - return FAILED; - } - output_tensor.push_back(out_tensor); + GeShape outShape(shapeDims); + outTensor.MutableTensorDesc().SetShape(outShape); + outTensor.MutableTensorDesc().SetDataType((DataType)output_desc[i].data_type); + outTensor.SetData(outBufTmp.get(), outputDataTmp.length); + 
output_tensor.push_back(outTensor); } GELOGI("[GraphExecutor] execute model success, modelId=%u.", model_id); @@ -464,6 +464,24 @@ Status GraphExecutor::GetInputOutputDescInfo(const uint32_t model_id, vector> &batch_info) { + auto model_manager = ge::ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + Status ret = model_manager->GetDynamicBatchInfo(model_id, batch_info); + if (ret != SUCCESS) { + GELOGE(ret, "GetDynamicBatchInfo failed."); + return ret; + } + + return SUCCESS; +} Status GraphExecutor::GetInputOutputDescInfoForZeroCopy(uint32_t model_id, vector &input_desc, vector &output_desc, diff --git a/src/ge/graph/execute/graph_execute.h b/src/ge/graph/execute/graph_execute.h index 35376184..5e926ae3 100644 --- a/src/ge/graph/execute/graph_execute.h +++ b/src/ge/graph/execute/graph_execute.h @@ -71,6 +71,15 @@ class GraphExecutor { vector &output_desc, std::vector &input_formats, std::vector &output_formats); + /// + /// @ingroup ge + /// @brief Get dynamic batch_info + /// @param [in] model_id + /// @param [out] batch_info + /// @return execute result + /// + static Status GetDynamicBatchInfo(uint32_t model_id, std::vector> &batch_info); + static Status GetInputOutputDescInfoForZeroCopy(uint32_t model_id, vector &input_desc, vector &output_desc, std::vector &input_formats, diff --git a/src/ge/graph/label/case_label_maker.cc b/src/ge/graph/label/case_label_maker.cc new file mode 100644 index 00000000..4d477bb7 --- /dev/null +++ b/src/ge/graph/label/case_label_maker.cc @@ -0,0 +1,130 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "case_label_maker.h" + +#include "common/util.h" +#include "common/ge_inner_error_codes.h" +#include "framework/common/types.h" +#include "framework/common/op/ge_op_utils.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" + +namespace ge { +constexpr uint32_t kCasePredIndex = 0; +constexpr uint32_t kMinCaseBranch = 1; +constexpr uint32_t kMaxCaseBranch = 0x7fffffff; + +/** + * @ingroup ge + * @brief Make label node to functional call. + * @param [in/out] label_index: serial id for whole graph. + * @return: 0 for success / others for fail + */ +Status CaseOpLabelMaker::Run(uint32_t &label_index) { + GE_CHECK_NOTNULL(parent_node_); + GE_CHECK_NOTNULL(parent_graph_); + + OpDescPtr case_desc = parent_node_->GetOpDesc(); + GE_CHECK_NOTNULL(case_desc); + + const auto graph_names = case_desc->GetSubgraphInstanceNames(); + if (graph_names.empty() || graph_names.size() > kMaxCaseBranch) { + GELOGE(INTERNAL_ERROR, "Node: %s has invalid subgraph, graph size: %zu.", case_desc->GetName().c_str(), + graph_names.size()); + return FAILED; + } + + // One branch, no need label. 
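+  // (A Case with a single subgraph degenerates to an unconditional branch:
+  // e.g. Case{g0} always runs g0, so no LabelSet/LabelGoto/LabelSwitchByIndex
+  // nodes are needed and the early return below applies.)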
+ const uint32_t graph_num = static_cast(graph_names.size()); + if (graph_num == kMinCaseBranch) { + GELOGI("Node: %s just one subgraph.", case_desc->GetName().c_str()); + return SUCCESS; + } + + NodePtr first_label = nullptr; + ComputeGraphPtr first_graph = nullptr; + std::vector switch_labels; + uint32_t last_label_index = label_index++; + for (uint32_t index = 0; index < graph_num; ++index) { + ComputeGraphPtr graph = parent_graph_->GetSubgraph(graph_names[index]); + GE_CHECK_NOTNULL(graph); + + // all branch, add label node to head. + uint32_t curr_label_index = label_index++; + std::string label_set_name = parent_node_->GetName() + "/LabelSet_" + std::to_string(index); // rtLabelSet + NodePtr label = AddLabelSetEnter(graph, label_set_name, curr_label_index); + if (label == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", graph->GetName().c_str()); + return FAILED; + } + switch_labels.emplace_back(curr_label_index); + if (index == 0) { // save first subgraph node for switch. + first_label = label; + first_graph = graph; + } + + if (index + 1 < graph_num) { + // middle node, add goto node to tail. + std::string label_goto_name = parent_node_->GetName() + "/LabelGoto_" + std::to_string(index); // rtLabelGoto + if (AddLabelGotoLeave(graph, label_goto_name, last_label_index) == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add label goto failed.", graph->GetName().c_str()); + return FAILED; + } + } else { + // last node, add label node to tail. + std::string last_label_name = parent_node_->GetName() + "/LabelSet_Last"; // rtLabelSet + if (AddLabelSetLeave(graph, last_label_name, last_label_index) == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", graph->GetName().c_str()); + return FAILED; + } + } + } + + // Add Switch node for first branch. + GE_CHECK_NOTNULL(first_label); + GE_CHECK_NOTNULL(first_graph); + + GeTensorDesc pred_desc = case_desc->GetInputDesc(kCasePredIndex); + GeTensorDesc cond_desc(GeShape(pred_desc.GetShape().GetDims()), pred_desc.GetFormat(), DT_UINT32); + + // first case, add switch node to head. + const std::string label_switch_name = parent_node_->GetName() + "/LabelSwitch"; // rtLabelSwitchByIndex + NodePtr switch_node = AddLabelSwitchEnter(first_graph, label_switch_name, cond_desc, switch_labels); + if (switch_node == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add label switch failed.", first_graph->GetName().c_str()); + return FAILED; + } + + // Link control edge to then branch head. + if (GraphUtils::AddEdge(switch_node->GetOutControlAnchor(), first_label->GetInControlAnchor()) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add ctrl edge to %s failed.", first_label->GetName().c_str()); + return FAILED; + } + + uint32_t parent_index = 0; // Case cond input is first. 
+ const std::string data_name = parent_node_->GetName() + "/SwitchIndexData"; + if (AddLabelSwitchIndex(first_graph, data_name, cond_desc, switch_node, parent_index) == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add switch input failed.", first_graph->GetName().c_str()); + return FAILED; + } + + GELOGI("Node: %s assign label success.", case_desc->GetName().c_str()); + return SUCCESS; +} + +REGISTER_LABEL_MAKER(CASE, CaseOpLabelMaker); +} // namespace ge diff --git a/src/ge/graph/label/case_label_maker.h b/src/ge/graph/label/case_label_maker.h new file mode 100644 index 00000000..3c43911c --- /dev/null +++ b/src/ge/graph/label/case_label_maker.h @@ -0,0 +1,94 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_PASSES_CASE_OP_LABEL_PASS_H_ +#define GE_GRAPH_PASSES_CASE_OP_LABEL_PASS_H_ + +#include "graph/node.h" +#include "graph/label/label_maker.h" +/******************************************************************************* + +-----------+ + | Node | + +-----------+ + | Node | + +-----------+ + | Case | + +-----------+ + +-----------+ + | Node | +-----------+ + +-----------+ /|SwitchByIdx| + | Node | A +-----------+ + +-----------+ / \|LabelSet(0)| + | Case | | +-----------+ + +-----------+ | | c | + | Node | | +-----------+ + +-----------+ | | a | + | Node | | +-----------+ + +-----------+ | | s | + | Node | | +-----------+ + +-----------+ | | e | + | +-----------+ + ====> | | LabelGoto |\ + V +-----------+ \ + |\ \ + | \ +-----------+ | + +-----------+ +-----------+ +-----------+ | \|LabelSet(1)| | + | c | | c | | c | | +-----------+ | + +-----------+ +-----------+ +-----------+ | | c | | + | a | | a | | a | | +-----------+ | + +-----------+ +-----------+ +-----------+ | | a | | + | s | | s | | s | | +-----------+ | + +-----------+ +-----------+ +-----------+ | | s | | + | e | | e | | e | | +-----------+ | + +-----------+ +-----------+ +-----------+ | | e | | + | +-----------+ V + | | LabelGoto |\ | + V +-----------+ \ | + \ \| + \ +-----------+ | + \|LabelSet(2)| | + +-----------+ | + | c | | + +-----------+ | + | a | | + +-----------+ | + | s | | + +-----------+ V + | e | / + +-----------+ / + | LabelSet |/ + +-----------+ + + +-----------+ + | Node | + +-----------+ + | Node | + +-----------+ + | Node | + +-----------+ +*******************************************************************************/ + +namespace ge { +class CaseOpLabelMaker : public LabelMaker { + public: + CaseOpLabelMaker(const ComputeGraphPtr &graph, const NodePtr &owner) : LabelMaker(graph, owner) {} + + ~CaseOpLabelMaker() override {} + + virtual Status Run(uint32_t &label_index); +}; +} // namespace ge +#endif // GE_GRAPH_PASSES_CASE_OP_LABEL_PASS_H_ diff --git a/src/ge/graph/label/if_label_maker.cc b/src/ge/graph/label/if_label_maker.cc new file mode 100644 index 00000000..5a7c028b --- /dev/null +++ b/src/ge/graph/label/if_label_maker.cc @@ -0,0 +1,118 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "if_label_maker.h" + +#include "common/util.h" +#include "common/ge_inner_error_codes.h" +#include "framework/common/types.h" +#include "framework/common/op/ge_op_utils.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" + +namespace ge { +constexpr uint8_t kIfPredIndex = 0; +constexpr uint8_t kThenBranchIndex = 0; +constexpr uint8_t kElseBranchIndex = 1; + +/** + * @ingroup ge + * @brief Make label node to functional call. + * @param [in/out] label_index: serial id for whole graph. + * @return: 0 for success / others for fail + */ +Status IfOpLabelMaker::Run(uint32_t &label_index) { + GE_CHECK_NOTNULL(parent_node_); + GE_CHECK_NOTNULL(parent_graph_); + + OpDescPtr if_desc = parent_node_->GetOpDesc(); + GE_CHECK_NOTNULL(if_desc); + + const std::string then_branch_name = if_desc->GetSubgraphInstanceName(kThenBranchIndex); + const std::string else_branch_name = if_desc->GetSubgraphInstanceName(kElseBranchIndex); + if (then_branch_name.empty() || else_branch_name.empty()) { + GELOGE(INTERNAL_ERROR, "Node: %s has invalid subgraph, then branch: %s, else branch: %s.", + if_desc->GetName().c_str(), then_branch_name.c_str(), else_branch_name.c_str()); + return FAILED; + } + + ComputeGraphPtr then_sub_graph = parent_graph_->GetSubgraph(then_branch_name); + ComputeGraphPtr else_sub_graph = parent_graph_->GetSubgraph(else_branch_name); + GE_CHECK_NOTNULL(then_sub_graph); + GE_CHECK_NOTNULL(else_sub_graph); + + const uint32_t then_enter_index = label_index++; + const uint32_t else_enter_index = label_index++; + const uint32_t else_leave_index = label_index++; + const std::string then_enter_name = parent_node_->GetName() + "/LabelSwitch"; // rtLabelSwitchByIndex + const std::string then_label_name = parent_node_->GetName() + "/ThenLabelSet"; // rtLabelSet(0) + const std::string then_leave_name = parent_node_->GetName() + "/LabelGoto"; // rtLabelGoto + const std::string else_enter_name = parent_node_->GetName() + "/ElseLabelSet"; // rtLabelSet(1) + const std::string else_leave_name = parent_node_->GetName() + "/LeaveLabelSet"; // rtLabelSet + + NodePtr then_enter_label = AddLabelSetEnter(then_sub_graph, then_label_name, then_enter_index); + if (then_enter_label == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", then_sub_graph->GetName().c_str()); + return FAILED; + } + + if (AddLabelGotoLeave(then_sub_graph, then_leave_name, else_leave_index) == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add label goto failed.", then_sub_graph->GetName().c_str()); + return FAILED; + } + + if (AddLabelSetEnter(else_sub_graph, else_enter_name, else_enter_index) == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", else_sub_graph->GetName().c_str()); + return FAILED; + } + if (AddLabelSetLeave(else_sub_graph, else_leave_name, else_leave_index) == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", else_sub_graph->GetName().c_str()); + return 
FAILED; + } + + // false ==> 0 ==> switch_labels[0] ==> else_enter_index + // true ==> 1 ==> switch_labels[1] ==> then_enter_index + const std::vector switch_labels = {else_enter_index, then_enter_index}; + + GeTensorDesc pred_desc = if_desc->GetInputDesc(kIfPredIndex); + GeTensorDesc cond_desc(GeShape(pred_desc.GetShape().GetDims()), pred_desc.GetFormat(), DT_UINT32); + NodePtr switch_node = AddLabelSwitchEnter(then_sub_graph, then_enter_name, cond_desc, switch_labels); + if (switch_node == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add label switch failed.", then_sub_graph->GetName().c_str()); + return FAILED; + } + + // Link control edge to then branch head. + if (GraphUtils::AddEdge(switch_node->GetOutControlAnchor(), then_enter_label->GetInControlAnchor()) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add ctrl edge to %s failed.", then_enter_label->GetName().c_str()); + return FAILED; + } + + uint32_t parent_index = 0; // If cond input is first. + const std::string data_name = parent_node_->GetName() + "/SwitchIndexData"; + if (AddLabelSwitchIndex(then_sub_graph, data_name, cond_desc, switch_node, parent_index) == nullptr) { + GELOGE(INTERNAL_ERROR, "Subgraph: %s add switch input failed.", then_sub_graph->GetName().c_str()); + return FAILED; + } + + GELOGI("Node: %s assign label success.", if_desc->GetName().c_str()); + return SUCCESS; +} + +REGISTER_LABEL_MAKER(IF, IfOpLabelMaker); +REGISTER_LABEL_MAKER(_IF, IfOpLabelMaker); +REGISTER_LABEL_MAKER(STATELESSIF, IfOpLabelMaker); +} // namespace ge diff --git a/src/ge/graph/label/if_label_maker.h b/src/ge/graph/label/if_label_maker.h new file mode 100644 index 00000000..1ee41819 --- /dev/null +++ b/src/ge/graph/label/if_label_maker.h @@ -0,0 +1,80 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_GRAPH_PASSES_IF_OP_LABEL_PASS_H_ +#define GE_GRAPH_PASSES_IF_OP_LABEL_PASS_H_ + +#include "graph/node.h" +#include "graph/label/label_maker.h" +/******************************************************************************* + +-----------+ + | Node | + +-----------+ + | Node | + +-----------+ + | If | + +-----------+ + +-----------+ + | Node | +-----------+ + +-----------+ /|SwitchByIdx| + | Node | A +-----------+ + +-----------+ / \|LabelSet(1)| + | If | | +-----------+ + +-----------+ | | t | + | Node | | +-----------+ + +-----------+ | | h | + | Node | | +-----------+ + +-----------+ | | e | + | Node | | +-----------+ + +-----------+ | | n | + | +-----------+ + ====> | | LabelGoto |\ + V +-----------+ \ + +-----------+ +-----------+ \ \ + | t | | e | \ +-----------+ | + +-----------+ +-----------+ \|LabelSet(0)| | + | h | | l | +-----------+ | + +-----------+ +-----------+ | e | | + | e | | s | +-----------+ | + +-----------+ +-----------+ | l | | + | n | | e | +-----------+ | + +-----------+ +-----------+ | s | | + +-----------+ V + | e | / + +-----------+ / + | LabelSet |/ + +-----------+ + + +-----------+ + | Node | + +-----------+ + | Node | + +-----------+ + | Node | + +-----------+ +*******************************************************************************/ + +namespace ge { +class IfOpLabelMaker : public LabelMaker { + public: + IfOpLabelMaker(const ComputeGraphPtr &graph, const NodePtr &owner) : LabelMaker(graph, owner) {} + + ~IfOpLabelMaker() override {} + + virtual Status Run(uint32_t &label_index); +}; +} // namespace ge +#endif // GE_GRAPH_PASSES_IF_OP_LABEL_PASS_H_ diff --git a/src/ge/graph/label/label_maker.cc b/src/ge/graph/label/label_maker.cc new file mode 100644 index 00000000..9ab6824c --- /dev/null +++ b/src/ge/graph/label/label_maker.cc @@ -0,0 +1,392 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "graph/label/label_maker.h" + +#include "common/util.h" +#include "common/ge_inner_error_codes.h" +#include "framework/common/types.h" +#include "framework/common/op/ge_op_utils.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" + +namespace ge { +constexpr static int64_t kInvalidStreamId = -1; + +/** + * @ingroup ge + * @brief Set stream id for head node. + * @param [in] graph: graph for add node. + * @param [in] op_desc: OpDesc for set logical stream id. 
+ * @return: void + */ +void LabelMaker::SetStreamIdEnter(const ComputeGraphPtr &graph, const OpDescPtr &op_desc) { + int64_t stream_id = kInvalidStreamId; + const auto &node_list = graph->GetDirectNode(); + for (size_t i = 0; i < node_list.size(); ++i) { + const auto &node = node_list.at(i); + GE_CHECK_NOTNULL_EXEC(node, continue); + + stream_id = node->GetOpDesc()->GetStreamId(); + if (stream_id != kInvalidStreamId) { + break; + } + } + + GELOGI("SetStreamId: Node %s assign stream is %ld.", op_desc->GetName().c_str(), stream_id); + op_desc->SetStreamId(stream_id); +} + +/** + * @ingroup ge + * @brief Set stream id for tail node. + * @param [in] graph: graph for add node. + * @param [in] op_desc: OpDesc for set logical stream id. + * @return: void + */ +void LabelMaker::SetStreamIdLeave(const ComputeGraphPtr &graph, const OpDescPtr &op_desc) { + int64_t stream_id = kInvalidStreamId; + const auto &node_list = graph->GetDirectNode(); + for (size_t i = node_list.size(); i > 0; --i) { + const auto &node = node_list.at(i - 1); // i from list size, need shift 1. + GE_CHECK_NOTNULL_EXEC(node, continue); + + stream_id = node->GetOpDesc()->GetStreamId(); + if (stream_id != kInvalidStreamId) { + break; + } + } + + GELOGI("SetStreamId: Node %s assign stream is %ld.", op_desc->GetName().c_str(), stream_id); + op_desc->SetStreamId(stream_id); +} + +/** + * @ingroup ge + * @brief Link Node to Graph head. + * @param [in] graph: graph for add node. + * @param [in] lb_node: Node for set link to head. + * @return: SUCCESS / FAILED + */ +Status LabelMaker::AddCtrlLink2Data(const ComputeGraphPtr &graph, const NodePtr &node) { + GE_CHECK_NOTNULL(graph); + GE_CHECK_NOTNULL(node); + + std::set linked_nodes; + for (const NodePtr &n : graph->GetDirectNode()) { + GE_CHECK_NOTNULL(n); + if (n->GetType() != DATA) { + continue; + } + + // Link control edge to graph head. + for (const NodePtr &out_node : n->GetOutAllNodes()) { + if (linked_nodes.count(out_node) > 0) { + continue; + } + + (void)linked_nodes.insert(out_node); + if (GraphUtils::AddEdge(node->GetOutControlAnchor(), out_node->GetInControlAnchor()) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "LabelSet: Add ctrl edge to %s failed.", node->GetName().c_str()); + return FAILED; + } + } + } + + return SUCCESS; +} + +/** + * @ingroup ge + * @brief Add LabelSet node at graph front. + * @param [in] graph: graph for add node. + * @param [in] name: label set node name. + * @param [in] index: label id for set. + * @return: NodePtr for success / nullptr for fail + */ +NodePtr LabelMaker::AddLabelSetEnter(const ComputeGraphPtr &graph, const std::string &name, uint32_t index) { + GE_CHECK_NOTNULL_EXEC(graph, return nullptr); + + const auto &node_list = graph->GetDirectNode(); + auto it = node_list.begin(); + if (it == node_list.end()) { + GELOGE(INTERNAL_ERROR, "LabelSet: Graph %s node is empty.", graph->GetName().c_str()); + return nullptr; + } + + OpDescPtr op_desc = MakeShared(name, LABELSET); + GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); + SetStreamIdEnter(graph, op_desc); + + GELOGI("LabelSet: Create node %s.", op_desc->GetName().c_str()); + (void)AttrUtils::SetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, index); + NodePtr label_set = graph->AddNodeFront(op_desc); + GE_CHECK_NOTNULL_EXEC(label_set, return nullptr); + + // Link control edge to graph head. 
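+  // AddCtrlLink2Data (above) wires the new LabelSet to every consumer of every
+  // DATA node, so the label is set before any real computation starts: e.g.
+  // for Data0 -> Conv -> Relu this yields LabelSet -ctrl-> Conv, while Data0
+  // itself needs no ordering edge.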
+ if (AddCtrlLink2Data(graph, label_set) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "LabelSet: Add ctrl edge to %s failed.", graph->GetName().c_str()); + return nullptr; + } + + return label_set; +} + +/** + * @ingroup ge + * @brief Add LabelSet node at graph back. + * @param [in] graph: graph for add node. + * @param [in] name: label set node name. + * @param [in] index: label id for set. + * @return: NodePtr for success / nullptr for fail + */ +NodePtr LabelMaker::AddLabelSetLeave(const ComputeGraphPtr &graph, const std::string &name, uint32_t index) { + GE_CHECK_NOTNULL_EXEC(graph, return nullptr); + + const auto &node_list = graph->GetDirectNode(); + auto it = node_list.end(); + if (it == node_list.begin()) { + GELOGE(INTERNAL_ERROR, "LabelSet: Graph %s node is empty.", graph->GetName().c_str()); + return nullptr; + } + --it; + const NodePtr &node = *it; + GE_CHECK_NOTNULL_EXEC(node, return nullptr); + + OpDescPtr op_desc = MakeShared(name, LABELSET); + GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); + SetStreamIdLeave(graph, op_desc); + + GELOGI("LabelSet: Create node %s.", op_desc->GetName().c_str()); + (void)AttrUtils::SetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, index); + NodePtr label_set = graph->AddNodeFront(op_desc); + GE_CHECK_NOTNULL_EXEC(label_set, return nullptr); + + // Link control edge to graph tail. + if (GraphUtils::AddEdge(node->GetOutControlAnchor(), label_set->GetInControlAnchor()) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "LabelSet: Add ctrl edge to %s failed.", node->GetName().c_str()); + return nullptr; + } + + return label_set; +} + +/** + * @ingroup ge + * @brief Add LabelGoto node at graph front. + * @param [in] graph: graph for add node. + * @param [in] name: label goto node name. + * @param [in] index: label id for goto. + * @return: NodePtr for success / nullptr for fail + */ +NodePtr LabelMaker::AddLabelGotoEnter(const ComputeGraphPtr &graph, const std::string &name, uint32_t index) { + GE_CHECK_NOTNULL_EXEC(graph, return nullptr); + + const auto &node_list = graph->GetDirectNode(); + auto it = node_list.begin(); + if (it == node_list.end()) { + GELOGE(INTERNAL_ERROR, "LabelGoto: Graph %s node is empty.", graph->GetName().c_str()); + return nullptr; + } + + OpDescPtr op_desc = MakeShared(name, LABELGOTO); + GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); + SetStreamIdEnter(graph, op_desc); + + GELOGI("LabelGoto: Create node %s.", op_desc->GetName().c_str()); + (void)AttrUtils::SetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, index); + NodePtr label_goto = graph->AddNodeFront(op_desc); + if (label_goto == nullptr) { + GELOGE(INTERNAL_ERROR, "LabelGoto: Add to graph %s failed.", graph->GetName().c_str()); + return nullptr; + } + + return label_goto; +} + +/** + * @ingroup ge + * @brief Add LabelGoto node at graph back. + * @param [in] graph: graph for add node. + * @param [in] name: label goto node name. + * @param [in] index: label id for goto. 
+ * @return: NodePtr for success / nullptr for fail
+ */
+NodePtr LabelMaker::AddLabelGotoLeave(const ComputeGraphPtr &graph, const std::string &name, uint32_t index) {
+  GE_CHECK_NOTNULL_EXEC(graph, return nullptr);
+
+  const auto &node_list = graph->GetDirectNode();
+  auto it = node_list.end();
+  if (it == node_list.begin()) {
+    GELOGE(INTERNAL_ERROR, "LabelGoto: Graph %s is empty.", graph->GetName().c_str());
+    return nullptr;
+  }
+  --it;
+  const NodePtr &node = *it;
+  GE_CHECK_NOTNULL_EXEC(node, return nullptr);
+
+  OpDescPtr op_desc = MakeShared<OpDesc>(name, LABELGOTO);
+  GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr);
+  SetStreamIdLeave(graph, op_desc);
+
+  GELOGI("LabelGoto: Create node %s.", op_desc->GetName().c_str());
+  (void)AttrUtils::SetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, index);
+  NodePtr label_goto = graph->AddNode(op_desc);
+  GE_CHECK_NOTNULL_EXEC(label_goto, return nullptr);
+
+  // Link control edge to graph tail.
+  if (GraphUtils::AddEdge(node->GetOutControlAnchor(), label_goto->GetInControlAnchor()) != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "LabelGoto: Add ctrl edge to %s failed.", node->GetName().c_str());
+    return nullptr;
+  }
+
+  return label_goto;
+}
+
+/**
+ * @ingroup ge
+ * @brief Add LabelSwitch node at graph front.
+ * @param [in] graph: graph for add node.
+ * @param [in] name: label switch node name.
+ * @param [in] desc: label index data desc.
+ * @param [in] labels: label id for switch.
+ * @return: NodePtr for success / nullptr for fail
+ */
+NodePtr LabelMaker::AddLabelSwitchEnter(const ComputeGraphPtr &graph, const std::string &name, const GeTensorDesc &desc,
+                                        const std::vector<uint32_t> &labels) {
+  GE_CHECK_NOTNULL_EXEC(graph, return nullptr);
+
+  const auto &node_list = graph->GetDirectNode();
+  auto it = node_list.begin();
+  if (it == node_list.end()) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Graph %s is empty.", graph->GetName().c_str());
+    return nullptr;
+  }
+
+  OpDescPtr op_desc = MakeShared<OpDesc>(name, LABELSWITCHBYINDEX);
+  GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr);
+  SetStreamIdEnter(graph, op_desc);
+
+  GELOGI("LabelSwitchByIndex: Create node %s.", op_desc->GetName().c_str());
+  if (op_desc->AddInputDesc(desc) != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add input desc failed.");
+    return nullptr;
+  }
+
+  if (!AttrUtils::SetListInt(op_desc, ATTR_NAME_LABEL_SWITCH_LIST, labels)) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Set %s failed.", ATTR_NAME_LABEL_SWITCH_LIST.c_str());
+    return nullptr;
+  }
+
+  NodePtr label_switch = graph->AddNodeFront(op_desc);
+  if (label_switch == nullptr) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add to graph %s failed.", graph->GetName().c_str());
+    return nullptr;
+  }
+
+  return label_switch;
+}
+
+/**
+ * @ingroup ge
+ * @brief Add LabelSwitch node at graph back.
+ * @param [in] graph: graph for add node.
+ * @param [in] name: label switch node name.
+ * @param [in] desc: label index data desc.
+ * @param [in] labels: label id for switch.
+ * @return: NodePtr for success / nullptr for fail
+ */
+NodePtr LabelMaker::AddLabelSwitchLeave(const ComputeGraphPtr &graph, const std::string &name, const GeTensorDesc &desc,
+                                        const std::vector<uint32_t> &labels) {
+  GE_CHECK_NOTNULL_EXEC(graph, return nullptr);
+
+  const auto &node_list = graph->GetDirectNode();
+  auto it = node_list.end();
+  if (it == node_list.begin()) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Graph %s is empty.", graph->GetName().c_str());
+    return nullptr;
+  }
+  --it;
+  const NodePtr &node = *it;
+  GE_CHECK_NOTNULL_EXEC(node, return nullptr);
+
+  OpDescPtr op_desc = MakeShared<OpDesc>(name, LABELSWITCHBYINDEX);
+  GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr);
+  SetStreamIdLeave(graph, op_desc);
+
+  GELOGI("LabelSwitchByIndex: Create node %s.", op_desc->GetName().c_str());
+  if (op_desc->AddInputDesc(desc) != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add input desc failed.");
+    return nullptr;
+  }
+
+  if (!AttrUtils::SetListInt(op_desc, ATTR_NAME_LABEL_SWITCH_LIST, labels)) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Set %s failed.", ATTR_NAME_LABEL_SWITCH_LIST.c_str());
+    return nullptr;
+  }
+
+  NodePtr label_switch = graph->AddNode(op_desc);
+  GE_CHECK_NOTNULL_EXEC(label_switch, return nullptr);
+
+  // Link control edge to graph tail.
+  if (GraphUtils::AddEdge(node->GetOutControlAnchor(), label_switch->GetInControlAnchor()) != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add ctrl edge to %s failed.", node->GetName().c_str());
+    return nullptr;
+  }
+
+  return label_switch;
+}
+
+/**
+ * @ingroup ge
+ * @brief Add Data node at graph front for switch input.
+ * @param [in] graph: graph for add node.
+ * @param [in] name: data node name.
+ * @param [in] desc: label index data desc.
+ * @param [in] sw_node: switch node for add input.
+ * @param [in] parent_index: index of the corresponding input on the parent node.
+ * @return: NodePtr for success / nullptr for fail
+ */
+NodePtr LabelMaker::AddLabelSwitchIndex(const ComputeGraphPtr &graph, const std::string &name, const GeTensorDesc &desc,
+                                        const NodePtr &sw_node, uint32_t parent_index) {
+  GE_CHECK_NOTNULL_EXEC(graph, return nullptr);
+
+  OpDescPtr op_desc = MakeShared<OpDesc>(name, DATA);
+  GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr);
+
+  GELOGI("Data: Create node %s.", op_desc->GetName().c_str());
+  if (op_desc->AddOutputDesc(desc) != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add data output desc failed.");
+    return nullptr;
+  }
+
+  if (!AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Set %s failed.", ATTR_NAME_PARENT_NODE_INDEX.c_str());
+    return nullptr;
+  }
+  NodePtr op_data = graph->AddNodeFront(op_desc);
+  GE_CHECK_NOTNULL_EXEC(op_data, return nullptr);
+  GE_CHECK_NOTNULL_EXEC(graph->AddInputNode(op_data), return nullptr);  // Take as input node for memory assign.
+
+  // Link data edge to switch input.
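+  // Usage sketch (illustrative names, not part of this patch): the Data node created
+  // above feeds the switch predicate through data anchor 0:
+  //   NodePtr sw = AddLabelSwitchEnter(graph, "name/LabelSwitch", desc, {0U, 1U});
+  //   NodePtr idx = AddLabelSwitchIndex(graph, "name/Index", desc, sw, 0U);
+  // ATTR_NAME_PARENT_NODE_INDEX set above lets memory assignment bind this Data node
+  // to the matching input of the parent functional node.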
+  if (GraphUtils::AddEdge(op_data->GetOutDataAnchor(0), sw_node->GetInDataAnchor(0)) != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "LabelSwitchByIndex: Add input edge to %s failed.", op_data->GetName().c_str());
+    return nullptr;
+  }
+
+  return op_data;
+}
+}  // namespace ge
diff --git a/src/ge/graph/label/label_maker.h b/src/ge/graph/label/label_maker.h
new file mode 100644
index 00000000..6b5ccbf7
--- /dev/null
+++ b/src/ge/graph/label/label_maker.h
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_GRAPH_PASSES_LABEL_MAKER_H_
+#define GE_GRAPH_PASSES_LABEL_MAKER_H_
+
+#include <vector>
+
+#include "graph/node.h"
+#include "graph/label/label_maker_factory.h"
+#include "framework/common/ge_inner_error_codes.h"
+
+namespace ge {
+class LabelMaker {
+ public:
+  LabelMaker(const ComputeGraphPtr &graph, const NodePtr &owner) : parent_node_(owner), parent_graph_(graph) {}
+
+  virtual ~LabelMaker() {
+    parent_node_ = nullptr;
+    parent_graph_ = nullptr;
+  }
+
+  virtual Status Run(uint32_t &label_index) = 0;
+
+  NodePtr AddLabelSetEnter(const ComputeGraphPtr &graph, const std::string &name, uint32_t index);
+  NodePtr AddLabelSetLeave(const ComputeGraphPtr &graph, const std::string &name, uint32_t index);
+
+  NodePtr AddLabelGotoEnter(const ComputeGraphPtr &graph, const std::string &name, uint32_t index);
+  NodePtr AddLabelGotoLeave(const ComputeGraphPtr &graph, const std::string &name, uint32_t index);
+
+  NodePtr AddLabelSwitchEnter(const ComputeGraphPtr &graph, const std::string &name, const GeTensorDesc &desc,
+                              const std::vector<uint32_t> &labels);
+  NodePtr AddLabelSwitchLeave(const ComputeGraphPtr &graph, const std::string &name, const GeTensorDesc &desc,
+                              const std::vector<uint32_t> &labels);
+
+  NodePtr AddLabelSwitchIndex(const ComputeGraphPtr &graph, const std::string &name, const GeTensorDesc &desc,
+                              const NodePtr &sw_node, uint32_t parent_index);
+
+  LabelMaker &operator=(const LabelMaker &model) = delete;
+  LabelMaker(const LabelMaker &model) = delete;
+
+ protected:
+  NodePtr parent_node_;
+  ComputeGraphPtr parent_graph_;
+
+ private:
+  Status AddCtrlLink2Data(const ComputeGraphPtr &graph, const NodePtr &node);
+  void SetStreamIdEnter(const ComputeGraphPtr &graph, const OpDescPtr &op_desc);
+  void SetStreamIdLeave(const ComputeGraphPtr &graph, const OpDescPtr &op_desc);
+};
+}  // namespace ge
+#endif  // GE_GRAPH_PASSES_LABEL_MAKER_H_
diff --git a/src/ge/graph/label/label_maker_factory.h b/src/ge/graph/label/label_maker_factory.h
new file mode 100644
index 00000000..0a87ec66
--- /dev/null
+++ b/src/ge/graph/label/label_maker_factory.h
@@ -0,0 +1,89 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_GRAPH_PASSES_LABEL_MAKER_FACTORY_H_
+#define GE_GRAPH_PASSES_LABEL_MAKER_FACTORY_H_
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+
+#include "common/ge/ge_util.h"
+#include "framework/common/debug/ge_log.h"
+
+namespace ge {
+class LabelMaker;
+using LabelMakerPtr = std::shared_ptr<LabelMaker>;
+
+class LabelMakerFactory {
+ public:
+  // LabelMaker creator function def
+  using LabelCreatorFun = std::function<LabelMakerPtr(const ComputeGraphPtr &, const NodePtr &)>;
+
+  static LabelMakerFactory &Instance() {
+    static LabelMakerFactory instance;
+    return instance;
+  }
+
+  LabelMakerPtr Create(const std::string &node_type, const ComputeGraphPtr &graph, const NodePtr &node) {
+    auto it = creator_map_.find(node_type);
+    if (it == creator_map_.end()) {
+      GELOGW("Cannot find node type %s in map.", node_type.c_str());
+      return nullptr;
+    }
+
+    return it->second(graph, node);
+  }
+
+  // LabelMaker registrar
+  class Registerar {
+   public:
+    Registerar(const std::string &node_type, const LabelCreatorFun func) {
+      LabelMakerFactory::Instance().RegisterCreator(node_type, func);
+    }
+
+    ~Registerar() {}
+  };
+
+ private:
+  LabelMakerFactory() {}
+
+  ~LabelMakerFactory() {}
+
+  // Register creator; called from Registerar's constructor.
+  void RegisterCreator(const std::string &node_type, const LabelCreatorFun func) {
+    auto it = creator_map_.find(node_type);
+    if (it != creator_map_.end()) {
+      GELOGD("LabelMakerFactory::RegisterCreator: %s creator already exists", node_type.c_str());
+      return;
+    }
+
+    creator_map_[node_type] = func;
+  }
+
+  std::map<std::string, LabelCreatorFun> creator_map_;
+};
+
+#define REGISTER_LABEL_MAKER(type, clazz)                                                          \
+  LabelMakerPtr Creator_##type##_Label_Maker(const ComputeGraphPtr &graph, const NodePtr &node) {  \
+    std::shared_ptr<clazz> maker = nullptr;                                                        \
+    maker = MakeShared<clazz>(graph, node);                                                        \
+    return maker;                                                                                  \
+  }                                                                                                \
+  LabelMakerFactory::Registerar g_##type##_Label_Maker_Creator(type, Creator_##type##_Label_Maker);
+}  // namespace ge
+#endif  // GE_GRAPH_PASSES_LABEL_MAKER_FACTORY_H_
\ No newline at end of file
diff --git a/src/ge/graph/label/partitioned_call_label_maker.cc b/src/ge/graph/label/partitioned_call_label_maker.cc
new file mode 100644
index 00000000..39c88717
--- /dev/null
+++ b/src/ge/graph/label/partitioned_call_label_maker.cc
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "partitioned_call_label_maker.h"
+
+#include "common/util.h"
+#include "common/ge_inner_error_codes.h"
+#include "framework/common/types.h"
+#include "graph/debug/ge_attr_define.h"
+#include "graph/utils/graph_utils.h"
+
+namespace ge {
+constexpr int32_t kSubGraphIndex = 0;
+
+/**
+ * @ingroup ge
+ * @brief Make label node to functional call.
+ * @param [in/out] label_index: serial id for whole graph.
+ * @return: 0 for success / others for fail
+ */
+Status PartitionedCallLabelMaker::Run(uint32_t &label_index) {
+  GE_CHECK_NOTNULL(parent_node_);
+  GE_CHECK_NOTNULL(parent_graph_);
+
+  OpDescPtr call_desc = parent_node_->GetOpDesc();
+  GE_CHECK_NOTNULL(call_desc);
+
+  std::string sub_graph_name = call_desc->GetSubgraphInstanceName(kSubGraphIndex);
+  if (sub_graph_name.empty()) {
+    GELOGE(INTERNAL_ERROR, "Node: %s has no subgraph name.", call_desc->GetName().c_str());
+    return FAILED;
+  }
+
+  ComputeGraphPtr sub_graph = parent_graph_->GetSubgraph(sub_graph_name);
+  if (sub_graph == nullptr) {
+    GELOGE(INTERNAL_ERROR, "Subgraph: %s not found in parent graph.", sub_graph_name.c_str());
+    return FAILED;
+  }
+
+  return SUCCESS;
+}
+
+REGISTER_LABEL_MAKER(PARTITIONEDCALL, PartitionedCallLabelMaker);
+REGISTER_LABEL_MAKER(STATEFULPARTITIONEDCALL, PartitionedCallLabelMaker);
+}  // namespace ge
diff --git a/src/ge/graph/label/partitioned_call_label_maker.h b/src/ge/graph/label/partitioned_call_label_maker.h
new file mode 100644
index 00000000..c78a06fc
--- /dev/null
+++ b/src/ge/graph/label/partitioned_call_label_maker.h
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef GE_GRAPH_PASSES_PARTITIONED_CALL_OP_LABEL_PASS_H_ +#define GE_GRAPH_PASSES_PARTITIONED_CALL_OP_LABEL_PASS_H_ + +#include "graph/node.h" +#include "graph/label/label_maker.h" +/******************************************************************************* + +---------------+ + | Node | + +---------------+ + | Node | + +---------------+ + |PartitionedCall| + +---------------+ + +---------------+ + | Node | +---------------+ + +---------------+ | f | + | Node | +---------------+ + +---------------+ | u | + |PartitionedCall| +---------------+ + +---------------+ | n | + | Node | ====> +---------------+ + +---------------+ | c | + | Node | +---------------+ + +---------------+ + | Node | +---------------+ + +---------------+ | Node | + +---------------+ + | Node | + +---------------+ +---------------+ + | f | | Node | + +---------------+ +---------------+ + | u | + +---------------+ + | n | + +---------------+ + | c | + +---------------+ +*******************************************************************************/ + +namespace ge { +class PartitionedCallLabelMaker : public LabelMaker { + public: + PartitionedCallLabelMaker(const ComputeGraphPtr &graph, const NodePtr &owner) : LabelMaker(graph, owner) {} + + ~PartitionedCallLabelMaker() override {} + + virtual Status Run(uint32_t &label_index); +}; +} // namespace ge +#endif // GE_GRAPH_PASSES_PARTITIONED_CALL_OP_LABEL_PASS_H_ \ No newline at end of file diff --git a/src/ge/graph/label/while_label_maker.cc b/src/ge/graph/label/while_label_maker.cc new file mode 100644 index 00000000..c9efccd5 --- /dev/null +++ b/src/ge/graph/label/while_label_maker.cc @@ -0,0 +1,126 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "while_label_maker.h" + +#include "common/util.h" +#include "common/ge_inner_error_codes.h" +#include "framework/common/types.h" +#include "framework/common/op/ge_op_utils.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" + +namespace ge { +constexpr uint8_t kCondOutputNum = 1; +constexpr uint8_t kCondOutputIndex = 0; +constexpr uint8_t kCondBranchIndex = 0; +constexpr uint8_t kBodyBranchIndex = 1; + +/** + * @ingroup ge + * @brief Make label node to functional call. + * @param [in/out] label_index: serial id for whole graph. 
+ * @return: 0 for success / others for fail
+ */
+Status WhileOpLabelMaker::Run(uint32_t &label_index) {
+  GE_CHECK_NOTNULL(parent_node_);
+  GE_CHECK_NOTNULL(parent_graph_);
+
+  OpDescPtr while_desc = parent_node_->GetOpDesc();
+  GE_CHECK_NOTNULL(while_desc);
+
+  std::string cond_name = while_desc->GetSubgraphInstanceName(kCondBranchIndex);
+  std::string body_name = while_desc->GetSubgraphInstanceName(kBodyBranchIndex);
+  if (cond_name.empty() || body_name.empty()) {
+    GELOGE(INTERNAL_ERROR, "Node: %s has invalid subgraph, cond branch: %s, body branch: %s.",
+           while_desc->GetName().c_str(), cond_name.c_str(), body_name.c_str());
+    return FAILED;
+  }
+
+  ComputeGraphPtr cond_graph = parent_graph_->GetSubgraph(cond_name);
+  ComputeGraphPtr body_graph = parent_graph_->GetSubgraph(body_name);
+  GE_CHECK_NOTNULL(cond_graph);
+  GE_CHECK_NOTNULL(body_graph);
+
+  const uint32_t cond_enter_index = label_index++;
+  const uint32_t body_enter_index = label_index++;
+  const uint32_t body_leave_index = label_index++;
+  const std::string cond_enter_name = parent_node_->GetName() + "/CondLabelSet";   // rtLabelSet
+  const std::string cond_leave_name = parent_node_->GetName() + "/LabelSwitch";    // rtLabelSwitchByIndex
+  const std::string body_enter_name = parent_node_->GetName() + "/EnterLabelSet";  // rtLabelSet
+  const std::string goto_leave_name = parent_node_->GetName() + "/LabelGoto";      // rtLabelGoto
+  const std::string body_leave_name = parent_node_->GetName() + "/LeaveLabelSet";  // rtLabelSet
+
+  if (AddLabelSetEnter(cond_graph, cond_enter_name, cond_enter_index) == nullptr) {
+    GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", cond_graph->GetName().c_str());
+    return FAILED;
+  }
+
+  if (AddLabelSetEnter(body_graph, body_enter_name, body_enter_index) == nullptr) {
+    GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", body_graph->GetName().c_str());
+    return FAILED;
+  }
+
+  if (AddLabelGotoLeave(body_graph, goto_leave_name, cond_enter_index) == nullptr) {
+    GELOGE(INTERNAL_ERROR, "Subgraph: %s add label goto failed.", body_graph->GetName().c_str());
+    return FAILED;
+  }
+
+  if (AddLabelSetLeave(body_graph, body_leave_name, body_leave_index) == nullptr) {
+    GELOGE(INTERNAL_ERROR, "Subgraph: %s add label set failed.", body_graph->GetName().c_str());
+    return FAILED;
+  }
+
+  NodePtr cond_out_node = cond_graph->FindNode(NODE_NAME_NET_OUTPUT);
+  GE_CHECK_NOTNULL(cond_out_node);
+  OpDescPtr cond_out_desc = cond_out_node->GetOpDesc();
+  GE_CHECK_NOTNULL(cond_out_desc);
+
+  GeTensorDesc pred_desc = cond_out_desc->GetInputDesc(kCondOutputIndex);
+  GeTensorDesc cond_desc(GeShape(pred_desc.GetShape().GetDims()), pred_desc.GetFormat(), DT_INT32);
+
+  // false ==> 0 ==> switch_labels[0] ==> body_leave_index
+  // true  ==> 1 ==> switch_labels[1] ==> body_enter_index
+  const std::vector<uint32_t> switch_labels = {body_leave_index, body_enter_index};
+  NodePtr switch_node = AddLabelSwitchLeave(cond_graph, cond_leave_name, cond_desc, switch_labels);
+  if (switch_node == nullptr) {
+    GELOGE(INTERNAL_ERROR, "Subgraph: %s add label switch failed.", cond_graph->GetName().c_str());
+    return FAILED;
+  }
+
+  // Link data input.
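+  // Expected runtime sequence for one iteration, given the labels assigned above
+  // (sketch for illustration, not generated code):
+  //   LabelSet(cond_enter) -> cond ops -> LabelSwitchByIndex(pred)
+  //     pred == 0 -> LabelSet(body_leave)                                // exit While
+  //     pred == 1 -> LabelSet(body_enter) -> body ops -> LabelGoto(cond_enter)
+  // The cond NetOutput's input edge is reused below as the switch input, so the
+  // predicate reaches LabelSwitchByIndex without an extra copy.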
+ const auto &all_in_data = cond_out_node->GetAllInDataAnchors(); + if (all_in_data.size() != kCondOutputNum) { + GELOGE(FAILED, "Node: %s Cond sbugraph output size:%zu should equal size:%u.", switch_node->GetName().c_str(), + all_in_data.size(), kCondOutputNum); + return FAILED; + } + + InDataAnchorPtr in_anchor = all_in_data.at(kCondOutputIndex); + GE_CHECK_NOTNULL(in_anchor); + if (GraphUtils::AddEdge(in_anchor->GetPeerOutAnchor(), switch_node->GetInDataAnchor(kCondOutputIndex)) != SUCCESS) { + GELOGE(FAILED, "Node: %s Add pred data input failed.", switch_node->GetName().c_str()); + return FAILED; + } + + GELOGI("Node: %s assign label success.", while_desc->GetName().c_str()); + return SUCCESS; +} + +REGISTER_LABEL_MAKER(WHILE, WhileOpLabelMaker); +REGISTER_LABEL_MAKER(_WHILE, WhileOpLabelMaker); +REGISTER_LABEL_MAKER(STATELESSWHILE, WhileOpLabelMaker); +} // namespace ge diff --git a/src/ge/graph/label/while_label_maker.h b/src/ge/graph/label/while_label_maker.h new file mode 100644 index 00000000..ea7787a2 --- /dev/null +++ b/src/ge/graph/label/while_label_maker.h @@ -0,0 +1,80 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_PASSES_WHILE_OP_LABEL_PASS_H_ +#define GE_GRAPH_PASSES_WHILE_OP_LABEL_PASS_H_ + +#include "graph/node.h" +#include "graph/label/label_maker.h" +/******************************************************************************* + +-----------+ + | Node | + +-----------+ + | Node | + +-----------+ + | While | + +-----------+ + +-----------+ + | Node | +-----------+ + +-----------+ | LabelSet |\ + | Node | +-----------+ \ + +-----------+ | c | \ + | While | +-----------+ A + +-----------+ | o | | + | Node | +-----------+ | + +-----------+ | n | | + | Node | +-----------+ | + +-----------+ | d | | + | Node | +-----------+ | + +-----------+ /|SwitchByIdx| | + / +-----------+ | + ====> / | + | \ +-----------+ | + | \|LabelSet(1)| | + | +-----------+ | + +-----------+ +-----------+ | | b | | + | c | | b | | +-----------+ | + +-----------+ +-----------+ | | o | | + | o | | o | | +-----------+ | + +-----------+ +-----------+ | | d | | + | n | | d | | +-----------+ | + +-----------+ +-----------+ | | y | / + | d | | y | V +-----------+ / + +-----------+ +-----------+ \ | LabelGoto |/ + \ +-----------+ + \|LabelSet(0)| + +-----------+ + + +-----------+ + | Node | + +-----------+ + | Node | + +-----------+ + | Node | + +-----------+ +*******************************************************************************/ + +namespace ge { +class WhileOpLabelMaker : public LabelMaker { + public: + WhileOpLabelMaker(const ComputeGraphPtr &graph, const NodePtr &owner) : LabelMaker(graph, owner) {} + + ~WhileOpLabelMaker() override {} + + virtual Status Run(uint32_t &label_index); +}; +} // namespace ge +#endif // GE_GRAPH_PASSES_WHILE_OP_LABEL_PASS_H_ diff --git a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc new file mode 
100644
index 00000000..c3de44c9
--- /dev/null
+++ b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc
@@ -0,0 +1,372 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/load/new_model_manager/cpu_queue_schedule.h"
+#include "common/debug/ge_log.h"
+
+namespace {
+const uint32_t kCoreDim = 1;  // for rtCpuKernelLaunch
+const char *const kCpuTaskModelEnqueue = "modelEnqueue";
+const char *const kCpuTaskPrepareInput = "modelPrepareInput";
+const char *const kCpuTaskWaitEndGraph = "modelWaitEndGraph";
+const char *const kCpuTaskPrepareOutput = "modelPrepareOutput";
+const char *const kCpuTaskModelDequeue = "modelDequeue";
+const char *const kCpuTaskModelRepeat = "modelRepeat";
+}  // namespace
+
+namespace ge {
+CpuTaskInfo::CpuTaskInfo(rtStream_t stream) : args_(nullptr), args_size_(0) { stream_ = stream; }
+
+CpuTaskInfo::~CpuTaskInfo() {
+  if (args_ == nullptr) {
+    return;
+  }
+
+  rtError_t status = rtFree(args_);
+  if (status != RT_ERROR_NONE) {
+    GELOGW("Call rt free failed, status: 0x%x", status);
+  }
+  args_ = nullptr;
+}
+
+///
+/// @ingroup ge
+/// @brief definiteness queue schedule, bind input queue to task.
+/// @param [in] queue_id: input queue id from user.
+/// @param [out] in_mbuf: input mbuf addr for input data.
+/// @return: 0 for success / others for failed
+///
+Status CpuTaskModelDequeue::Init(uint32_t queue_id, uintptr_t &in_mbuf) {
+  if ((args_ != nullptr) || (args_size_ > 0)) {
+    GELOGE(FAILED, "Task already initialized, size: %u", args_size_);
+    return FAILED;
+  }
+
+  args_size_ = sizeof(MbufQueueInfo) + sizeof(uintptr_t);  // sizeof(uintptr_t) to save in_mbuf.
+  rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+  in_mbuf = reinterpret_cast<uintptr_t>(args_) + sizeof(MbufQueueInfo);
+  GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "args data.", args_size_)
+
+  MbufQueueInfo queue_info;
+  queue_info.queue_id = queue_id;
+  queue_info.in_mbuf = in_mbuf;  // Placeholder, input mbuf addr will be saved to this place.
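+  // Device args layout for this task (sketch): | MbufQueueInfo | uintptr_t slot |.
+  // The trailing slot is assumed to be where the AICPU "modelDequeue" kernel writes
+  // the dequeued mbuf address; in_mbuf computed above is the device address of that
+  // slot, so later tasks can reference the mbuf before it exists.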
+  status = rtMemcpy(args_, args_size_, &queue_info, sizeof(MbufQueueInfo), RT_MEMCPY_HOST_TO_DEVICE);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+
+  return SUCCESS;
+}
+
+Status CpuTaskModelDequeue::Distribute() {
+  if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) {
+    GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_);
+    return FAILED;
+  }
+
+  rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskModelDequeue, kCoreDim, args_, args_size_, nullptr, stream_);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt CpuKernelLaunch ModelDequeue failed, status: 0x%X", status);
+    return RT_FAILED;
+  }
+
+  GELOGI("Cpu kernel launch model dequeue task success.");
+  return SUCCESS;
+}
+
+///
+/// @ingroup ge
+/// @brief definiteness queue schedule, copy input data from input mbuf to input tensor.
+/// @param [in] addr: model input tensor address.
+/// @param [in] size: model input tensor size.
+/// @param [in] in_mbuf: input mbuf addr from model dequeue.
+/// @return: 0 for success / others for failed
+///
+Status CpuTaskPrepareInput::Init(uintptr_t addr, uint32_t size, uintptr_t in_mbuf) {
+  if ((args_ != nullptr) || (args_size_ > 0)) {
+    GELOGE(FAILED, "Task already initialized, size: %u", args_size_);
+    return FAILED;
+  }
+
+  args_size_ = sizeof(PrepareInputInfo);
+  rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+  GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "args data.", args_size_)
+
+  PrepareInputInfo prepare;
+  prepare.in_mbuf = in_mbuf;
+  prepare.mbuf_offset = 0;
+  prepare.data_size = size;
+  prepare.data_addr = addr;
+  status = rtMemcpy(args_, args_size_, &prepare, args_size_, RT_MEMCPY_HOST_TO_DEVICE);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+
+  return SUCCESS;
+}
+
+Status CpuTaskPrepareInput::Distribute() {
+  if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) {
+    GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_);
+    return FAILED;
+  }
+
+  rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskPrepareInput, kCoreDim, args_, args_size_, nullptr, stream_);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt CpuKernelLaunch PrepareInput failed, status: 0x%X", status);
+    return RT_FAILED;
+  }
+
+  GELOGI("Cpu kernel launch prepare input task success.");
+  return SUCCESS;
+}
+
+///
+/// @ingroup ge
+/// @brief definiteness queue schedule, prepare output mbuf from NetOutput input tensor.
+/// @param [in] addr: NetOutput Op input tensor address.
+/// @param [in] size: NetOutput Op input tensor size.
+/// @param [in] in_mbuf: input mbuf addr, for fill output mbuf header.
+/// @param [out] out_mbuf: output mbuf addr for output data.
+/// @return: 0 for success / others for failed
+///
+Status CpuTaskPrepareOutput::Init(uintptr_t addr, uint32_t size, uintptr_t in_mbuf, uintptr_t &out_mbuf) {
+  if ((args_ != nullptr) || (args_size_ > 0)) {
+    GELOGE(FAILED, "Task already initialized, size: %u", args_size_);
+    return FAILED;
+  }
+
+  args_size_ = sizeof(PrepareOutputInfo) + sizeof(uintptr_t);  // sizeof(uintptr_t) to save out_mbuf.
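+  // Same layout convention as CpuTaskModelDequeue (assumed kernel contract):
+  // | PrepareOutputInfo | uintptr_t slot |, where the slot receives the output mbuf
+  // address allocated by the AICPU kernel; the host only reserves the space here.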
+  rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+  out_mbuf = reinterpret_cast<uintptr_t>(args_) + sizeof(PrepareOutputInfo);
+  GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "args data.", args_size_)
+
+  // Get NetOutput input address and bind to queue.
+  PrepareOutputInfo prepare;
+  prepare.data_size = size;
+  prepare.data_addr = addr;
+  prepare.in_mbuf = in_mbuf;
+  prepare.out_mbuf = out_mbuf;  // Placeholder, output mbuf addr will be saved to this place.
+  status = rtMemcpy(args_, args_size_, &prepare, sizeof(PrepareOutputInfo), RT_MEMCPY_HOST_TO_DEVICE);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+
+  return SUCCESS;
+}
+
+Status CpuTaskPrepareOutput::Distribute() {
+  if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) {
+    GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_);
+    return FAILED;
+  }
+
+  rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskPrepareOutput, kCoreDim, args_, args_size_, nullptr, stream_);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt CpuKernelLaunch PrepareOutput failed, status: 0x%X", status);
+    return RT_FAILED;
+  }
+
+  GELOGI("Cpu kernel launch prepare output task success.");
+  return SUCCESS;
+}
+
+///
+/// @ingroup ge
+/// @brief definiteness queue schedule, bind output queue to task.
+/// @param [in] queue_id: output queue id from user.
+/// @param [in] out_mbuf: mbuf for output data.
+/// @return: 0 for success / others for failed
+///
+Status CpuTaskModelEnqueue::Init(uint32_t queue_id, uintptr_t out_mbuf) {
+  if ((args_ != nullptr) || (args_size_ > 0)) {
+    GELOGE(FAILED, "Task already initialized, size: %u", args_size_);
+    return FAILED;
+  }
+
+  // Get NetOutput input address and bind to queue.
+  args_size_ = sizeof(MbufQueueInfo);
+  rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+  GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "args data.", args_size_)
+
+  MbufQueueInfo queue_info;
+  queue_info.queue_id = queue_id;
+  queue_info.in_mbuf = out_mbuf;
+  status = rtMemcpy(args_, args_size_, &queue_info, args_size_, RT_MEMCPY_HOST_TO_DEVICE);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+
+  return SUCCESS;
+}
+
+Status CpuTaskModelEnqueue::Distribute() {
+  if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) {
+    GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_);
+    return FAILED;
+  }
+
+  rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskModelEnqueue, kCoreDim, args_, args_size_, nullptr, stream_);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt CpuKernelLaunch ModelEnqueue failed, status: 0x%X", status);
+    return RT_FAILED;
+  }
+
+  GELOGI("Cpu kernel launch model enqueue task success.");
+  return SUCCESS;
+}
+
+///
+/// @ingroup ge
+/// @brief definiteness queue schedule, active entry stream.
+/// @param [in] stream: stream to be active.
+/// @return: 0 for success / others for failed
+///
+Status CpuTaskActiveEntry::Init(rtStream_t stream) {
+  if (stream == nullptr) {
+    GELOGE(FAILED, "Task active stream not valid");
+    return FAILED;
+  }
+
+  active_stream_ = stream;
+  return SUCCESS;
+}
+
+Status CpuTaskActiveEntry::Distribute() {
+  if ((active_stream_ == nullptr) || (stream_ == nullptr)) {
+    GELOGE(FAILED, "Task not initialized, distribute failed.");
+    return FAILED;
+  }
+
+  rtError_t ret = rtStreamActive(active_stream_, stream_);
+  if (ret != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt StreamActive failed, ret: 0x%X", ret);
+    return RT_FAILED;
+  }
+
+  GELOGI("Active entry stream task success.");
+  return SUCCESS;
+}
+
+///
+/// @ingroup ge
+/// @brief definiteness queue schedule, wait for end graph.
+/// @param [in] model_id: model id for wait end graph.
+/// @return: 0 for success / others for failed
+///
+Status CpuTaskWaitEndGraph::Init(uint32_t model_id) {
+  if ((args_ != nullptr) || (args_size_ > 0)) {
+    GELOGE(FAILED, "Task already initialized, size: %u", args_size_);
+    return FAILED;
+  }
+
+  args_size_ = sizeof(model_id);
+  rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+  GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "args data.", args_size_)
+
+  status = rtMemcpy(args_, args_size_, &model_id, args_size_, RT_MEMCPY_HOST_TO_DEVICE);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status);
+    return RT_FAILED;
+  }
+
+  return SUCCESS;
+}
+
+Status CpuTaskWaitEndGraph::Distribute() {
+  if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) {
+    GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_);
+    return FAILED;
+  }
+
+  rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskWaitEndGraph, kCoreDim, args_, args_size_, nullptr, stream_);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt CpuKernelLaunch WaitEndGraph failed, status: 0x%X", status);
+    return RT_FAILED;
+  }
+
+  GELOGI("Cpu kernel launch wait end graph task success.");
+  return SUCCESS;
+}
+
+///
+/// @ingroup ge
+/// @brief definiteness queue schedule, repeat run model.
+/// @param [in] model_id: model id for repeat run.
+/// @return: 0 for success / others for failed +/// +Status CpuTaskModelRepeat::Init(uint32_t model_id) { + if ((args_ != nullptr) || (args_size_ > 0)) { + GELOGE(FAILED, "Task already initialized, size: %u", args_size_); + return FAILED; + } + + args_size_ = sizeof(model_id); + rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_FAILED; + } + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "args data.", args_size_) + + status = rtMemcpy(args_, args_size_, &model_id, args_size_, RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_FAILED; + } + + return SUCCESS; +} + +Status CpuTaskModelRepeat::Distribute() { + if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) { + GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_); + return FAILED; + } + + rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskModelRepeat, kCoreDim, args_, args_size_, nullptr, stream_); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt CpuKernelLaunch ModelRepeat failed, status: 0x%x", status); + return RT_FAILED; + } + + GELOGI("Cpu kernel launch repeat task success."); + return SUCCESS; +} +} // namespace ge diff --git a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h new file mode 100644 index 00000000..8a9af63f --- /dev/null +++ b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h @@ -0,0 +1,168 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_CPU_QUEUE_SCHEDULE_H_ +#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_CPU_QUEUE_SCHEDULE_H_ + +#include +#include + +#include "common/ge_inner_error_codes.h" +#include "graph/load/new_model_manager/task_info/task_info.h" +#include "runtime/kernel.h" + +namespace ge { +// For AICPU task "modelDequeue" / "modelEnqueue" +struct MbufQueueInfo { + uint32_t queue_id; // Op queue id + uintptr_t in_mbuf; // addr for input mbuf +}; + +// For AICPU task "modelPrepareInput" +struct PrepareInputInfo { + uintptr_t in_mbuf; // input mbuf from dequeue + uint32_t mbuf_offset; // offset of mbuf(current is 0) + uint32_t data_size; // input Tensor size + uintptr_t data_addr; // input Tensor addr +}; + +// For AICPU task "modelPrepareOutput" +struct PrepareOutputInfo { + uint32_t data_size; // output Tensor size + uintptr_t data_addr; // output Tensor addr + uintptr_t in_mbuf; // input mbuf, for fill output mbuf header + uintptr_t out_mbuf; // output mbuf addr +}; + +/// +/// @ingroup ge +/// @brief CpuTask base, inherit from TaskInfo used for manage. 
+/// +class CpuTaskInfo : public TaskInfo { + public: + explicit CpuTaskInfo(rtStream_t stream); + ~CpuTaskInfo() override; + + protected: + void *args_; + uint32_t args_size_; +}; + +/// +/// @ingroup ge +/// @brief definiteness queue schedule, bind input queue to task. +/// +class CpuTaskModelDequeue : public CpuTaskInfo { + public: + explicit CpuTaskModelDequeue(rtStream_t stream) : CpuTaskInfo(stream) {} + ~CpuTaskModelDequeue() override {} + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override { return SUCCESS; } + Status Init(uint32_t queue_id, uintptr_t &in_mbuf); + + Status Distribute() override; +}; + +/// +/// @ingroup ge +/// @brief definiteness queue schedule, bind output queue to task. +/// +class CpuTaskPrepareInput : public CpuTaskInfo { + public: + explicit CpuTaskPrepareInput(rtStream_t stream) : CpuTaskInfo(stream) {} + ~CpuTaskPrepareInput() override {} + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override { return SUCCESS; } + Status Init(uintptr_t addr, uint32_t size, uintptr_t in_mbuf); + + Status Distribute() override; +}; + +/// +/// @ingroup ge +/// @brief definiteness queue schedule, active original model stream. +/// +class CpuTaskPrepareOutput : public CpuTaskInfo { + public: + explicit CpuTaskPrepareOutput(rtStream_t stream) : CpuTaskInfo(stream) {} + ~CpuTaskPrepareOutput() override {} + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override { return SUCCESS; } + Status Init(uintptr_t addr, uint32_t size, uintptr_t in_mbuf, uintptr_t &out_mbuf); + + Status Distribute() override; +}; + +class CpuTaskModelEnqueue : public CpuTaskInfo { + public: + explicit CpuTaskModelEnqueue(rtStream_t stream) : CpuTaskInfo(stream) {} + ~CpuTaskModelEnqueue() override {} + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override { return SUCCESS; } + Status Init(uint32_t queue_id, uintptr_t out_mbuf); + + Status Distribute() override; +}; + +/// +/// @ingroup ge +/// @brief definiteness queue schedule, active entry stream. +/// +class CpuTaskActiveEntry : public CpuTaskInfo { + public: + explicit CpuTaskActiveEntry(rtStream_t stream) : CpuTaskInfo(stream), active_stream_(nullptr) {} + ~CpuTaskActiveEntry() override {} + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override { return SUCCESS; } + Status Init(rtStream_t stream); + + Status Distribute() override; + + private: + rtStream_t active_stream_; +}; + +/// +/// @ingroup ge +/// @brief definiteness queue schedule, wait for end graph. +/// +class CpuTaskWaitEndGraph : public CpuTaskInfo { + public: + explicit CpuTaskWaitEndGraph(rtStream_t stream) : CpuTaskInfo(stream) {} + ~CpuTaskWaitEndGraph() override {} + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override { return SUCCESS; } + Status Init(uint32_t model_id); + + Status Distribute() override; +}; + +/// +/// @ingroup ge +/// @brief definiteness queue schedule, repeat run model. 
+/// +class CpuTaskModelRepeat : public CpuTaskInfo { + public: + explicit CpuTaskModelRepeat(rtStream_t stream) : CpuTaskInfo(stream) {} + ~CpuTaskModelRepeat() override {} + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override { return SUCCESS; } + Status Init(uint32_t model_id); + + Status Distribute() override; +}; +} // namespace ge +#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_CPU_QUEUE_SCHEDULE_H_ diff --git a/src/ge/graph/load/new_model_manager/data_dumper.cc b/src/ge/graph/load/new_model_manager/data_dumper.cc index 659ffbf8..824f6b18 100644 --- a/src/ge/graph/load/new_model_manager/data_dumper.cc +++ b/src/ge/graph/load/new_model_manager/data_dumper.cc @@ -14,21 +14,20 @@ * limitations under the License. */ +#include "graph/load/new_model_manager/data_dumper.h" +#include #include #include -#include - -#include "graph/load/new_model_manager/data_dumper.h" -#include "graph/utils/attr_utils.h" -#include "graph/debug/ge_attr_define.h" -#include "framework/common/debug/ge_log.h" -#include "proto/op_mapping_info.pb.h" -#include "proto/ge_ir.pb.h" -#include "runtime/mem.h" #include "common/properties_manager.h" +#include "framework/common/debug/ge_log.h" #include "framework/common/util.h" -#include "model_utils.h" #include "graph/anchor.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/attr_utils.h" +#include "graph/load/new_model_manager/model_utils.h" +#include "proto/ge_ir.pb.h" +#include "proto/op_mapping_info.pb.h" +#include "runtime/mem.h" namespace { const uint32_t kAicpuLoadFlag = 1; @@ -116,6 +115,7 @@ void DataDumper::SaveDumpInput(const std::shared_ptr &node) { GELOGE(PARAM_INVALID, "input op desc is null."); return; } + input_map_.insert( {op_desc->GetName(), {input_op_desc, dst_in_data_anchor->GetIdx(), out_data_anchor->GetIdx()}}); } @@ -140,18 +140,18 @@ void DataDumper::SaveDumpTask(uint32_t task_id, const std::shared_ptr &o return; } - auto output_tensor = data_op->GetOutputDescPtr(inner_input_mapping.output_anchor_index); - if (output_tensor == nullptr) { - GELOGE(PARAM_INVALID, "output_tensor is null, index: %d, size: %zu.", inner_input_mapping.output_anchor_index, - data_op->GetOutputsSize()); + auto input_tensor = op_desc->GetInputDescPtr(inner_input_mapping.input_anchor_index); + if (input_tensor == nullptr) { + GELOGE(PARAM_INVALID, "input_tensor is null, index: %d, size: %zu.", inner_input_mapping.input_anchor_index, + op_desc->GetInputsSize()); return; } - uintptr_t data_addr = args - sizeof(void *) * data_op->GetInputOffset().size() + - sizeof(void *) * static_cast(inner_input_mapping.output_anchor_index); + uintptr_t data_addr = args - sizeof(void *) * op_desc->GetInputOffset().size() + + sizeof(void *) * static_cast(inner_input_mapping.input_anchor_index); GELOGI("Save input dump task %s, id: %u.", data_op->GetName().c_str(), task_id); op_list_.push_back({task_id, data_op, data_addr, false, inner_input_mapping.input_anchor_index, - inner_input_mapping.output_anchor_index}); + inner_input_mapping.output_anchor_index, input_tensor->GetShape().GetDims()}); } } @@ -180,25 +180,28 @@ static void SetOpMappingLoopAddr(uintptr_t step_id, uintptr_t loop_per_iter, uin } Status DataDumper::LoadDumpInfo() { - GELOGI("%zu op need dump in %s.", op_list_.size(), model_name_.c_str()); + PrintCheckLog(); + if (op_list_.empty()) { return SUCCESS; } aicpu::dump::OpMappingInfo op_mapping_info; - op_mapping_info.set_dump_path(PropertiesManager::Instance().GetDumpOutputPath() + std::to_string(device_id_) + "/"); - 
op_mapping_info.set_model_name(model_name_); op_mapping_info.set_model_id(model_id_); op_mapping_info.set_flag(kAicpuLoadFlag); + op_mapping_info.set_dump_step(PropertiesManager::Instance().GetDumpStep()); SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info); + GELOGD("Dump step in load dump info is %s", PropertiesManager::Instance().GetDumpStep().c_str()); for (const auto &op_iter : op_list_) { aicpu::dump::Task task; + auto op_desc = op_iter.op; + task.set_end_graph(op_desc->GetType() == ENDGRAPH); task.set_task_id(op_iter.task_id); - task.mutable_op()->set_op_name(op_iter.op->GetName()); - task.mutable_op()->set_op_type(op_iter.op->GetType()); + task.mutable_op()->set_op_name(op_desc->GetName()); + task.mutable_op()->set_op_type(op_desc->GetType()); if (op_iter.is_task) { // tbe or aicpu op @@ -249,7 +252,7 @@ Status DataDumper::LoadDumpInfo() { output.set_data_type(static_cast(GetIrDataType(output_tensor->GetDataType()))); output.set_format(static_cast(output_tensor->GetFormat())); - for (auto dim : output_tensor->GetShape().GetDims()) { + for (auto dim : op_iter.dims) { output.mutable_shape()->add_dim(dim); } @@ -270,10 +273,10 @@ Status DataDumper::LoadDumpInfo() { } std::string proto_str; - uint32_t proto_size = op_mapping_info.ByteSizeLong(); + size_t proto_size = op_mapping_info.ByteSizeLong(); bool ret = op_mapping_info.SerializeToString(&proto_str); if (!ret || proto_size == 0) { - GELOGE(FAILED, "Protobuf SerializeToString failed, proto size %u.", proto_size); + GELOGE(FAILED, "Protobuf SerializeToString failed, proto size %zu.", proto_size); return FAILED; } @@ -287,6 +290,7 @@ Status DataDumper::LoadDumpInfo() { GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); return RT_FAILED; } + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "load dump information.", proto_size) rt_ret = rtMemcpy(dev_mem_load_, proto_size, proto_str.c_str(), proto_size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { @@ -340,6 +344,7 @@ Status DataDumper::UnloadDumpInfo() { GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret); return RT_FAILED; } + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "unload dump information.", proto_size) rt_ret = rtMemcpy(dev_mem_unload_, proto_size, proto_str.c_str(), proto_size, RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { @@ -352,9 +357,42 @@ Status DataDumper::UnloadDumpInfo() { GELOGE(RT_FAILED, "Call rtDatadumpInfoLoad failed, ret: 0x%X", rt_ret); return RT_FAILED; } - load_flag_ = false; GELOGI("UnloadDumpInfo success, proto size: %zu.", proto_size); return SUCCESS; } + +void DataDumper::PrintCheckLog() { + std::set model_list = PropertiesManager::Instance().GetAllDumpModel(); + if (model_list.empty()) { + GELOGI("No model need dump."); + return; + } + + GELOGI("%zu op need dump in %s.", op_list_.size(), model_name_.c_str()); + if (model_list.find(ge::DUMP_ALL_MODEL) == model_list.end()) { + if (model_list.find(model_name_) == model_list.end()) { + std::string model_list_str; + for (auto &model : model_list) { + model_list_str += "[" + model + "]."; + } + + GELOGW("Model %s not be set to dump, dump list: %s", model_name_.c_str(), model_list_str.c_str()); + return; + } + } + + std::set config_dump_op_list = PropertiesManager::Instance().GetDumpPropertyValue(model_name_); + std::set dump_op_list; + for (auto &inner_dump_info : op_list_) { + // oplist value OpDescPtr is not nullptr + dump_op_list.insert(inner_dump_info.op->GetName()); + } + + for (auto &dump_op : config_dump_op_list) { + if (dump_op_list.find(dump_op) == 
dump_op_list.end()) { + GELOGW("Op %s set to dump but not exist in model %s or not a valid op.", dump_op.c_str(), model_name_.c_str()); + } + } +} } // namespace ge diff --git a/src/ge/graph/load/new_model_manager/data_dumper.h b/src/ge/graph/load/new_model_manager/data_dumper.h index 128e18ee..4400b127 100644 --- a/src/ge/graph/load/new_model_manager/data_dumper.h +++ b/src/ge/graph/load/new_model_manager/data_dumper.h @@ -17,10 +17,10 @@ #ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DATA_DUMPER_H_ #define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DATA_DUMPER_H_ -#include +#include #include +#include #include -#include #include "framework/common/ge_inner_error_codes.h" #include "graph/node.h" @@ -59,6 +59,7 @@ class DataDumper { private: void ReleaseDevMem(void **ptr) noexcept; + void PrintCheckLog(); std::string model_name_; uint32_t model_id_; @@ -85,6 +86,7 @@ struct DataDumper::InnerDumpInfo { bool is_task; int input_anchor_index; int output_anchor_index; + std::vector dims; }; struct DataDumper::InnerInputMapping { @@ -92,7 +94,6 @@ struct DataDumper::InnerInputMapping { int input_anchor_index; int output_anchor_index; }; - } // namespace ge #endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DATA_DUMPER_H_ diff --git a/src/ge/graph/load/new_model_manager/data_inputer.h b/src/ge/graph/load/new_model_manager/data_inputer.h index 7e396807..cc511c36 100644 --- a/src/ge/graph/load/new_model_manager/data_inputer.h +++ b/src/ge/graph/load/new_model_manager/data_inputer.h @@ -22,8 +22,8 @@ #include #include "common/blocking_queue.h" -#include "common/types.h" #include "common/ge_types.h" +#include "common/types.h" namespace ge { /// diff --git a/src/ge/graph/load/new_model_manager/davinci_model.cc b/src/ge/graph/load/new_model_manager/davinci_model.cc index 236b934a..7b743f3c 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.cc +++ b/src/ge/graph/load/new_model_manager/davinci_model.cc @@ -16,15 +16,15 @@ #include "graph/load/new_model_manager/davinci_model.h" +#include #include #include #include #include #include -#include #include +#include #include -#include #include "common/debug/log.h" #include "common/formats/formats.h" @@ -40,8 +40,9 @@ #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" #include "graph/graph.h" -#include "graph/load/output/output.h" +#include "graph/load/new_model_manager/cpu_queue_schedule.h" #include "graph/load/new_model_manager/tbe_handle_store.h" +#include "graph/load/output/output.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_var_manager.h" #include "graph/manager/util/debug.h" @@ -58,6 +59,7 @@ #include "runtime/event.h" #include "runtime/mem.h" #include "runtime/stream.h" +#include "securec.h" // create std::thread, catch exceptions using try/catch #define CREATE_STD_THREAD(thread_id, func, args) \ @@ -73,11 +75,13 @@ namespace ge { namespace { -const uint32_t DEFAULT_DATA_INDEX = 0; -const uint32_t TRUE_BRANCH_STREAM_NUM = 1; -const uint32_t THREAD_NUM = 16; +const uint32_t kDataIndex = 0; +const uint32_t kTrueBranchStreamNum = 1; +const uint32_t kThreadNum = 16; const int kDecimal = 10; const int kBytes = 8; +const uint32_t kDataMemAlignSizeCompare = 64; +const char *const kDefaultBatchLable = "Batch_default"; class RtContextSwitchGuard { public: @@ -104,15 +108,9 @@ class RtContextSwitchGuard { ~RtContextSwitchGuard() { if (current_ != nullptr) { auto ret = rtCtxDestroy(current_); - if (ret != RT_ERROR_NONE) { - GELOGW("Failed to call rtCtxDestroy"); - } } if (last_ != nullptr) { auto ret = rtCtxSetCurrent(last_); - 
if (ret != RT_ERROR_NONE) { - GELOGW("Failed to call rtCtxSetCurrent"); - } } } @@ -121,17 +119,17 @@ class RtContextSwitchGuard { rtContext_t current_; }; -int CalcVarSizeInBytes(const GeTensorDesc &desc) { - int var_size = GetSizeByDataType(desc.GetDataType()); +int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) { + int64_t var_size = GetSizeByDataType(desc.GetDataType()); if (var_size <= 0) { GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s", TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str()); return -1; } auto shape = desc.GetShape(); - auto dimNum = shape.GetDimNum(); - for (size_t dimIndex = 0; dimIndex < dimNum; ++dimIndex) { - var_size *= static_cast(shape.GetDim(dimIndex)); + auto dim_num = shape.GetDimNum(); + for (size_t dim_index = 0; dim_index < dim_num; ++dim_index) { + var_size *= shape.GetDim(dim_index); } return var_size; } @@ -151,18 +149,18 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); if (var_addr == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to copy var %s from device, can not get var addr", var->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to copy var %s from device, cant not get var addr", var->GetName().c_str()); return INTERNAL_ERROR; } - int var_size_bytes = CalcVarSizeInBytes(input_desc); + int64_t var_size_bytes = CalcVarSizeInBytes(input_desc); if (var_size_bytes <= 0) { return INTERNAL_ERROR; } std::unique_ptr var_host(new (std::nothrow) uint8_t[var_size_bytes]); if (var_host == nullptr) { - GELOGE(OUT_OF_MEMORY, "Failed to malloc rt-host memory, size %d", var_size_bytes); + GELOGE(OUT_OF_MEMORY, "Failed to malloc rt-host memory, size %ld", var_size_bytes); return OUT_OF_MEMORY; } @@ -170,20 +168,19 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt var_size_bytes, RT_MEMCPY_DEVICE_TO_HOST); if (ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, - "Failed to copy var memory from device, var %s, size %d," + "Failed to copy var memory from device, var %s, size %ld," " rt-error-code %u", var->GetName().c_str(), var_size_bytes, ret); return RT_FAILED; } - GELOGD("Copy var %s from device to host, size %d", var->GetName().c_str(), var_size_bytes); + GELOGD("Copy var %s from device to host, size %ld", var->GetName().c_str(), var_size_bytes); var_data.swap(var_host); return SUCCESS; } Status CopyVarToDevice(const NodePtr &var, const formats::TransResult &trans_result, void *var_addr) { - GE_CHECK_NOTNULL(var); GELOGD("Copy var %s from host to device, size %zu", var->GetName().c_str(), trans_result.length); auto ret = rtMemcpy(var_addr, trans_result.length, reinterpret_cast(trans_result.data.get()), trans_result.length, RT_MEMCPY_HOST_TO_DEVICE); @@ -195,7 +192,7 @@ Status CopyVarToDevice(const NodePtr &var, const formats::TransResult &trans_res } Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats::TransResult &result) { - formats::TransResult resultLastTime{}; + formats::TransResult result_last_time{}; bool use_init_data = true; for (const auto &trans_info : trans_road) { if (trans_info.node_type == RESHAPE || trans_info.node_type == REFORMAT) { @@ -207,7 +204,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats src_data = var_data; use_init_data = false; } else { - src_data = resultLastTime.data.get(); + src_data = result_last_time.data.get(); } formats::TransResult tmp_result{}; @@ -254,14 +251,13 @@ 
Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats trans_info.node_type.c_str()); return UNSUPPORTED; } - resultLastTime = tmp_result; + result_last_time = tmp_result; } - result = resultLastTime; + result = result_last_time; return SUCCESS; } -/// /// re-alloc var memory on device using var-manager /// free origin var memory(var manager does not support now) /// @param session_id @@ -269,7 +265,6 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats /// @param var_size_bytes /// @param var_device /// @return -/// Status ReAssignVarAddr(uint64_t session_id, const std::string &var_name, const GeTensorDesc &tensor_desc, void **var_device) { uint8_t *var_logic = nullptr; @@ -292,16 +287,16 @@ Status ReAssignVarAddr(uint64_t session_id, const std::string &var_name, const G return SUCCESS; } -Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t session_id, uint32_t device_id) { +Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t session_id) { // do not need to do anything if only all reshape/reformat node on the trans_road GE_CHECK_NOTNULL(var); bool need_trans = false; - if (std::any_of(trans_road.begin(), trans_road.end(), [](const ge::TransNodeInfo &road) { - return road.node_type != RESHAPE && road.node_type != REFORMAT; - })) { - need_trans = true; + for (auto &road : trans_road) { + if (road.node_type != RESHAPE && road.node_type != REFORMAT) { + need_trans = true; + break; + } } - if (!need_trans) { return SUCCESS; } @@ -326,12 +321,11 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t } void *var_device = nullptr; - /// + /// It is a temporary solution to use the last GeTensorDesc to assign variable memory because the variable manager /// depends on TensorDesc and it is difficult to be modified. The correct solution is to assign memory based on the /// size of the converted variable. To complete the final solution, the dependency of the variable manager on /// TensorDesc needs to be removed. This change is large and needs to be performed step by step. 
- /// ret = ReAssignVarAddr(session_id, var->GetName(), trans_road.rbegin()->output, &var_device); if (ret != SUCCESS) { GELOGE(ret, "Failed to re-assign memory on device, size %zu", trans_result.length); @@ -347,8 +341,38 @@ Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t return SUCCESS; } + +bool CheckDynamicBatchZeroCopyAddr(const void *addr, const vector &dynamic_input_addrs, + const vector &fix_input_addrs) { + if (fix_input_addrs.empty()) { + if (!dynamic_input_addrs.empty() && + std::find(dynamic_input_addrs.begin(), dynamic_input_addrs.end(), addr) == dynamic_input_addrs.end()) { + return false; + } + } else { + if (!dynamic_input_addrs.empty() && + std::find(dynamic_input_addrs.begin(), dynamic_input_addrs.end(), addr) == dynamic_input_addrs.end() && + std::find(fix_input_addrs.begin(), fix_input_addrs.end(), addr) == fix_input_addrs.end()) { + return false; + } + } + return true; +} + +inline bool IsDataOp(const std::string &node_type) { + return node_type == DATA_TYPE || node_type == AIPP_DATA_TYPE || node_type == ANN_DATA_TYPE; +} +inline bool IsCallDumpInputOp(const OpDescPtr &op_desc) { + bool skip_task_generate = false; + (void)ge::AttrUtils::GetBool(op_desc, ATTR_NO_TASK_AND_DUMP_NEEDED, skip_task_generate); + return skip_task_generate; +} + } // namespace +SysMode DavinciModel::mode_ = INFERENCE; +std::mutex DavinciModel::mutex_mode_; + std::mutex DavinciModel::tvm_bin_mutex_; std::set DavinciModel::tvm_bin_kernel_; @@ -359,9 +383,13 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptrRelease(), "Release task failed."); + } + } + cpu_task_list_.clear(); + for (const auto &task : task_list_) { if (task != nullptr) { - GE_CHK_STATUS(task->Release()); + GE_CHK_STATUS(task->Release(), "Release task failed."); } } } @@ -457,7 +497,7 @@ Status DavinciModel::Assign(const GeModelPtr &ge_model) { return SUCCESS; } -Status DavinciModel::InitModelMem(void *dev_ptr, size_t memsize, void *weight_ptr, size_t weight_size) { +Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) { if (is_model_has_inited_) { GELOGI("call InitModelMem more than once ."); return FAILED; @@ -471,8 +511,8 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t memsize, void *weight_pt GE_CHECK_LE(weights_size, ALLOC_MEMORY_MAX_SIZE); - if ((dev_ptr != nullptr) && (memsize < TotalMemSize())) { - GELOGE(FAILED, "Invalid mem param: memsize=%zu totalsize=%zu.", memsize, TotalMemSize()); + if ((dev_ptr != nullptr) && (mem_size < TotalMemSize())) { + GELOGE(FAILED, "Invalid mem param: mem_size=%zu totalsize=%zu.", mem_size, TotalMemSize()); return FAILED; } @@ -538,28 +578,33 @@ void DavinciModel::InitRuntimeParams() { ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_STREAM_NUM, value); runtime_param_.stream_num = ret ? (uint32_t)value : 0; ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_EVENT_NUM, value); - runtime_param_.event_num = ret ? (uint64_t)value : 0; + runtime_param_.event_num = ret ? (uint32_t)value : 0; + ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_LABEL_NUM, value); + runtime_param_.label_num = ret ? (uint32_t)value : 0; ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_BATCH_NUM, value); runtime_param_.batch_num = ret ? (uint32_t)value : 0; ret = ge::AttrUtils::GetInt(ge_model_, MODEL_ATTR_TASK_GEN_BASE_ADDR, value); runtime_param_.logic_mem_base = ret ? 
(uint64_t)value : 0; ret = ge::AttrUtils::GetInt(ge_model_, MODEL_ATTR_TASK_GEN_WEIGHT_ADDR, value); runtime_param_.logic_weight_base = ret ? (uint64_t)value : 0; - ret = ge::AttrUtils::GetInt(ge_model_, MODEL_ATTR_SESSION_ID, value); + ret = ge::AttrUtils::GetInt(ge_model_, ge::MODEL_ATTR_SESSION_ID, value); runtime_param_.session_id = ret ? (uint64_t)value : 0; ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_TASK_GEN_VAR_ADDR, value); runtime_param_.logic_var_base = ret ? (uint64_t)value : 0; ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_VAR_SIZE, value); runtime_param_.var_size = ret ? (uint64_t)value : 0; session_id_ = runtime_param_.session_id; - GELOGI("Init(),memory_size:%lu, weight_size:%lu, stream_num:%u, session_id:%lu, var_size:%lu.", + GELOGI("InitRuntimeParams(), memory_size:%lu, weight_size:%lu, stream_num:%u, session_id:%lu, var_size:%lu.", runtime_param_.mem_size, runtime_param_.weight_size, runtime_param_.stream_num, runtime_param_.session_id, runtime_param_.var_size); - GELOGI("Init(),event_num:%u, batch_num:%u", runtime_param_.event_num, runtime_param_.batch_num); + GELOGI("InitRuntimeParams(), event_num:%u, label_num:%u", runtime_param_.event_num, runtime_param_.label_num); } void DavinciModel::CheckHasHcomOp() { + // definiteness queue schedule, all streams by TS. + GE_IF_BOOL_EXEC(!input_queue_ids_.empty() || !output_queue_ids_.empty(), return ); + Graph graph = ge_model_->GetGraph(); auto compute_graph = GraphUtils::GetComputeGraph(graph); if (compute_graph == nullptr) { @@ -586,8 +631,8 @@ Status DavinciModel::DoTaskSink() { if (model_task_def_) { GELOGI("do task_sink."); - // create model_handle to load model - GE_CHK_RT_RET(rtModelCreate(&rt_model_handle_, 0)); + // will adjust stream indication, load first. + GE_CHK_STATUS_RET(LoadWithQueue(), "LoadWithQueue failed."); for (size_t i = 0; i < stream_list_.size(); i++) { GE_IF_BOOL_EXEC(active_stream_indication_.count(i) > 0, GELOGI("rtModelBindStream[%zu]", i); @@ -603,11 +648,43 @@ Status DavinciModel::DoTaskSink() { GE_CHK_RT_RET(rtModelLoadComplete(rt_model_handle_)); } + + for (const auto &addrs : input_outside_addrs_) { + const auto &used_list = addrs.second; + if (used_list.empty()) { + GELOGI("No sinked data found, disable input zero copy."); + input_use_zero_copy_ = false; + break; + } + } + + for (const auto &addrs : output_outside_addrs_) { + const auto &used_list = addrs.second; + if (used_list.empty()) { + GELOGI("No sinked data found, disable output zero copy."); + output_use_zero_copy_ = false; + break; + } + } + return SUCCESS; +} + +// set device use aicore(0) or vectorcore(1) +Status DavinciModel::SetTSDevice() { + int64_t value = 0; + bool ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_CORE_TYPE, value); + uint32_t core_type = ret ?
static_cast(value) : 0; + GELOGI("SetTSDevice: %u", core_type); + rtError_t rt_ret = rtSetTSDevice(core_type); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "SetTSDevice failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } return SUCCESS; } // initialize op sequence and call initialization function of each op respectively -Status DavinciModel::Init(void *dev_ptr, size_t memsize, void *weight_ptr, size_t weight_size) { +Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) { // validating params GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(priority_ < 0 || priority_ > 7, return PARAM_INVALID, "Priority must between 0-7, now is %d", priority_); @@ -615,9 +692,13 @@ Status DavinciModel::Init(void *dev_ptr, size_t memsize, void *weight_ptr, size_ // Initializing runtime_param_ InitRuntimeParams(); + // RTS set aicore or vectorcore + GE_CHK_STATUS_RET(SetTSDevice(), "SetTSDevice failed"); + version_ = ge_model_->GetVersion(); name_ = ge_model_->GetName(); - + (void)ge::AttrUtils::GetBool(ge_model_, ATTR_NAME_SWITCH_FOR_L1_FUSION, is_l1_fusion_enable_); + GELOGD("The value of ge.l1Fusion in ge_model_ is %d.", is_l1_fusion_enable_); CheckHasHcomOp(); for (uint32_t i = 0; i < StreamNum(); i++) { @@ -642,16 +723,21 @@ Status DavinciModel::Init(void *dev_ptr, size_t memsize, void *weight_ptr, size_ event_list_.push_back(rt_event); } - for (uint32_t i = 0; ((BatchNum() != 0) && (i <= BatchNum())); i++) { - rtLabel_t rtLabel; - GE_CHK_RT_RET(rtLabelCreate(&rtLabel)); - GE_CHK_BOOL_RET_STATUS(rtLabel != nullptr, FAILED, "rtLabel is nullptr!"); - label_list_.push_back(rtLabel); + for (uint32_t i = 0; i < LabelNum(); i++) { + rtLabel_t rt_label; + GE_CHK_RT_RET(rtLabelCreate(&rt_label)); + GE_CHK_BOOL_RET_STATUS(rt_label != nullptr, FAILED, "rt_label is nullptr."); + label_list_.push_back(rt_label); } + // create model_handle to load model + GE_CHK_RT_RET(rtModelCreate(&rt_model_handle_, 0)); + GE_CHK_RT_RET(rtModelGetId(rt_model_handle_, &runtime_model_id_)); + Graph graph = ge_model_->GetGraph(); auto compute_graph = GraphUtils::GetComputeGraph(graph); - GE_CHK_BOOL_RET_STATUS(compute_graph != nullptr, INTERNAL_ERROR, "Get compute graph is nullptr!"); + compute_graph_ = compute_graph; + GE_CHK_BOOL_RET_STATUS(compute_graph != nullptr, INTERNAL_ERROR, "Get compute graph is nullptr."); runtime_param_.graph_id = GetGraphID(compute_graph->GetName()); @@ -661,11 +747,11 @@ Status DavinciModel::Init(void *dev_ptr, size_t memsize, void *weight_ptr, size_ GE_CHK_STATUS_RET(CopyVarData(compute_graph), "copy var data failed."); GE_TIMESTAMP_START(InitModelMem); - GE_CHK_STATUS_RET_NOLOG(InitModelMem(dev_ptr, memsize, weight_ptr, weight_size)); + GE_CHK_STATUS_RET_NOLOG(InitModelMem(dev_ptr, mem_size, weight_ptr, weight_size)); GE_TIMESTAMP_END(InitModelMem, "GraphLoader::InitModelMem"); data_inputer_ = new (std::nothrow) DataInputer(); - GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, INTERNAL_ERROR, "data_inputer_ is nullptr!"); + GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, INTERNAL_ERROR, "data_inputer_ is nullptr."); for (const ge::NodePtr &node : compute_graph->GetDirectNode()) { GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr, continue); @@ -676,9 +762,6 @@ Status DavinciModel::Init(void *dev_ptr, size_t memsize, void *weight_ptr, size_ // for profiling op_name_map_ = compute_graph->GetGraphOpName(); - GE_TIMESTAMP_CALLNUM_START(LoadTBEKernelBinToOpDesc); - GE_TIMESTAMP_CALLNUM_START(InitTbeHandle); - vector op_name; GE_IF_BOOL_EXEC(ge::AttrUtils::GetListStr(ge_model_, 
ATTR_MODEL_TASK_INDEX_OP_NAME, op_name), GELOGI("get str of task_index_op_name")); @@ -689,72 +772,280 @@ Status DavinciModel::Init(void *dev_ptr, size_t memsize, void *weight_ptr, size_ GELOGI("infer profiling: op_name_size(%zu)", op_name.size()); } + if (InitNodes(compute_graph) != SUCCESS) { + return FAILED; + } + + SetDataDumperArgs(); + + GE_TIMESTAMP_START(DoTaskSink); + auto ret = DoTaskSink(); + GE_TIMESTAMP_END(DoTaskSink, "GraphLoader::DoTaskSink"); + + // collect profiling for ge + if (ProfilingManager::Instance().ProfilingOn()) { + std::vector compute_graph_desc_info; + Status ret1 = GetComputeGraphInfo(compute_graph_desc_info); + if (ret1 != SUCCESS) { + GELOGE(ret1, "GetComputeGraphInfo failed."); + return ret1; + } + ProfilingManager::Instance().ReportProfilingData(GetTaskDescInfo(), compute_graph_desc_info); + } + return ret; +} + +/// +/// @ingroup ge +/// @brief Travel all nodes and do some init. +/// @param [in] compute_graph: ComputeGraph to load. +/// @return Status +/// +Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { + uint32_t data_op_index = 0; + std::map> input_data_info; + + GE_TIMESTAMP_CALLNUM_START(LoadTBEKernelBinToOpDesc); + GE_TIMESTAMP_CALLNUM_START(InitTbeHandle); + auto nodes = compute_graph->GetAllNodes(); - tbekernel_store_ = ge_model_->GetTBEKernelStore(); + const TBEKernelStore &tbekernel_store = ge_model_->GetTBEKernelStore(); for (size_t i = 0; i < nodes.size(); i++) { auto node = nodes.at(i); - GE_CHK_BOOL_RET_STATUS(node != nullptr, PARAM_INVALID, "CreateOp failed."); - auto op_desc = node->GetOpDesc(); - GE_CHK_BOOL_RET_STATUS(op_desc != nullptr, PARAM_INVALID, "op_desc is null."); - op_list_[i] = op_desc; + if (op_desc == nullptr) { + GELOGE(PARAM_INVALID, "op_desc is null."); + return PARAM_INVALID; + } + + op_list_[op_desc->GetId()] = op_desc; GE_TIMESTAMP_RESTART(LoadTBEKernelBinToOpDesc); - tbekernel_store_.LoadTBEKernelBinToOpDesc(op_desc); + tbekernel_store.LoadTBEKernelBinToOpDesc(op_desc); GE_TIMESTAMP_ADD(LoadTBEKernelBinToOpDesc); - if (op_desc->GetType() == DATA_TYPE || op_desc->GetType() == AIPP_DATA_TYPE || - op_desc->GetType() == ANN_DATA_TYPE) { - data_op_list_.push_back(op_desc); - GE_IF_BOOL_EXEC( - (op_desc->GetInputDescPtr(0) != nullptr && op_desc->GetInputDescPtr(0)->GetFormat() != FORMAT_FILTER_HWCK), - data_op_input_tensor_desc_map_[op_desc->GetName()] = op_desc->GetInputDescPtr(0)); - GE_IF_BOOL_EXEC( - (op_desc->GetOutputDescPtr(0) != nullptr && op_desc->GetOutputDescPtr(0)->GetFormat() != FORMAT_FRACTAL_Z), - data_op_output_tensor_desc_map_[op_desc->GetName()] = op_desc->GetOutputDescPtr(0)); - SetOutsideAddr(ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc)); + if (IsDataOp(op_desc->GetType())) { + if (InitDataOp(node, data_op_index, input_data_info) != SUCCESS) { + GELOGE(PARAM_INVALID, "Data init failed, Name: %s", op_desc->GetName().c_str()); + return PARAM_INVALID; + } data_dumper_.SaveDumpInput(node); + continue; } - GE_IF_BOOL_EXEC(op_desc->GetType() == VARIABLE, variable_op_list_.push_back(op_desc)); + if (IsCallDumpInputOp(op_desc)) { + GELOGI("node[%s] is a no-task op, call SaveDumpInput to save its output node info", op_desc->GetName().c_str()); + data_dumper_.SaveDumpInput(node); + continue; + } - GE_IF_BOOL_EXEC(op_desc->GetType() == NETOUTPUT, output_op_list_.push_back(op_desc); - GE_CHK_STATUS_RET(ModelUtils::GetOutputSize(op_desc, output_size_list_, output_memory_size_list_), - "Get output size fail"); - SetOutsideAddr(ModelUtils::GetInputDataAddrs(runtime_param_, op_desc)));
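Editor's note: the InitNodes extraction above replaces nested GE_IF_BOOL_EXEC macros with one explicit branch per node type, each ending in continue, so a failing per-type init returns immediately with the node name already logged. The resulting control flow is essentially a type-dispatch loop; a self-contained sketch follows (handler names and bodies are placeholders, and a handler table stands in for the if/continue ladder):

#include <functional>
#include <map>
#include <string>
#include <vector>

enum StatusSketch { kSuccess = 0, kParamInvalid = 1 };

// Dispatch each node type to its init handler; unknown types fall through.
StatusSketch InitNodesSketch(const std::vector<std::string> &node_types) {
  const std::map<std::string, std::function<StatusSketch()>> handlers = {
      {"Data", [] { return kSuccess; }},       // InitDataOp in the patch
      {"Variable", [] { return kSuccess; }},   // recorded for later sync
      {"NetOutput", [] { return kSuccess; }},  // InitNetOutput in the patch
      {"Constant", [] { return kSuccess; }},   // training-only init
  };
  for (const auto &type : node_types) {
    auto it = handlers.find(type);
    if (it != handlers.end() && it->second() != kSuccess) {
      return kParamInvalid;  // fail fast, as each branch does above
    }
  }
  return kSuccess;
}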
+ if (op_desc->GetType() == VARIABLE) { + variable_op_list_.push_back(op_desc); + continue; + } + + if (op_desc->GetType() == NETOUTPUT) { + if (InitNetOutput(op_desc) != SUCCESS) { + GELOGE(PARAM_INVALID, "NetOutput init failed, Name: %s", op_desc->GetName().c_str()); + return PARAM_INVALID; + } + continue; + } // Initialize constant op, only applies to training, ignoring inference constant op - GE_IF_BOOL_EXEC(op_desc->GetType() == CONSTANTOP, - GE_CHK_STATUS_RET(InitConstant(op_desc), "Constant init failed. %s", op_desc->GetName().c_str());); + if (op_desc->GetType() == CONSTANTOP) { + if (InitConstant(op_desc) != SUCCESS) { + GELOGE(PARAM_INVALID, "Constant init failed. %s", op_desc->GetName().c_str()); + return PARAM_INVALID; + } + continue; + } + + if (op_desc->GetType() == ENDGRAPH) { + end_graph_op_ = op_desc; + } GE_TIMESTAMP_RESTART(InitTbeHandle); uint32_t run_mode = static_cast(domi::ImplyType::INVALID); - GE_IF_BOOL_EXEC((AttrUtils::GetInt(op_desc, ATTR_NAME_IMPLY_TYPE, run_mode) && - run_mode == static_cast(domi::ImplyType::TVM)), - GE_CHK_STATUS_RET(InitTbeHandle(op_desc), "TBE init failed. %s", op_desc->GetName().c_str());); + if (AttrUtils::GetInt(op_desc, ATTR_NAME_IMPLY_TYPE, run_mode) && + run_mode == static_cast(domi::ImplyType::TVM)) { + // Skip no_task operator, such as concat and split. + bool attr_notask = false; + bool get_attr_notask_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOTASK, attr_notask); + GE_IF_BOOL_EXEC(get_attr_notask_flag && attr_notask, + GELOGI("Node[name:%s, type:%s] does not generate task, skip initialization.", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + continue;); + + if (InitTbeHandle(op_desc) != SUCCESS) { + GELOGE(PARAM_INVALID, "TBE init failed. %s", op_desc->GetName().c_str()); + return PARAM_INVALID; + } + } GE_TIMESTAMP_ADD(InitTbeHandle); - GE_CHK_STATUS_RET(MarkActiveStream(op_desc), "MarkActiveStream failed, node:%s, opIndex:%zu", - op_desc->GetName().c_str(), i); + if (MarkActiveStream(op_desc) != SUCCESS) { + GELOGE(PARAM_INVALID, "MarkActiveStream failed, node:%s, opIndex:%zu", op_desc->GetName().c_str(), i); + return PARAM_INVALID; + } } - GE_TIMESTAMP_CALLNUM_END(LoadTBEKernelBinToOpDesc, "GraphLoader::LoadTBEKernelBinToOpDesc"); - GE_TIMESTAMP_CALLNUM_END(InitTbeHandle, "GraphLoader::InitTbeHandle"); - - SetDataDumperArgs(); - GE_TIMESTAMP_START(DoTaskSink); - auto ret = DoTaskSink(); - GE_TIMESTAMP_END(DoTaskSink, "GraphLoader::DoTaskSink"); + Status ret = CombineDataInfo(input_data_info); + GE_TIMESTAMP_CALLNUM_END(LoadTBEKernelBinToOpDesc, "GraphLoader::LoadTBEKernelBinToOpDesc."); + GE_TIMESTAMP_CALLNUM_END(InitTbeHandle, "GraphLoader::InitTbeHandle."); return ret; } +/// @ingroup ge +/// @brief Data Op Initialize. +/// @param [in] NodePtr: Data Op. +/// @param [in/out] data_op_index: NetOutput addr size info. +/// @param [in/out] input_data_info: Data index and addr info {index, {size, addr}}. +/// @return Status +Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, + std::map> &input_data_info) { + // op_desc Checked by Init: Data, valid. + auto op_desc = node->GetOpDesc(); + uint32_t parent_index = 0; // Ignore subgraph Data Node. 
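Editor's note on the guard that continues below: a Data node is treated as a subgraph input, and skipped, purely because ATTR_NAME_PARENT_NODE_INDEX is present on it; the attribute's value is not inspected at this point. A tiny sketch of that rule, modeling the attribute lookup with std::optional (illustrative only):

#include <cstdint>
#include <optional>

// Present attribute => subgraph Data node => do not register as model input.
bool SkipAsSubgraphDataSketch(const std::optional<uint32_t> &parent_index_attr) {
  return parent_index_attr.has_value();
}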
+ if (AttrUtils::GetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + GELOGI("Skip subgraph Data node: %s.", op_desc->GetName().c_str()); + return SUCCESS; + } + + data_op_list_.push_back(op_desc); + ConstGeTensorDescPtr input_desc = op_desc->GetInputDescPtr(kDataIndex); + if (input_desc != nullptr && input_desc->GetFormat() != FORMAT_FILTER_HWCK) { + data_op_input_tensor_desc_map_[op_desc->GetName()] = input_desc; + } + + ConstGeTensorDescPtr output_desc = op_desc->GetOutputDescPtr(kDataIndex); + if (output_desc != nullptr && output_desc->GetFormat() != FORMAT_FRACTAL_Z) { + data_op_output_tensor_desc_map_[op_desc->GetName()] = output_desc; + } + + // Make information for copy input data. + const vector output_size_list = ModelUtils::GetOutputSize(op_desc); + const vector output_addr_list = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc); + if (output_size_list.empty() || output_addr_list.empty() || (output_size_list.size() != output_addr_list.size())) { + GELOGE(PARAM_INVALID, "Data[%s] init failed: Output size is %zu, Output addr is %zu", op_desc->GetName().c_str(), + output_size_list.size(), output_addr_list.size()); + return PARAM_INVALID; + } + + auto data_index = data_op_index; + if (AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, data_index)) { + GELOGI("ge_train:get new index %u, old %u", data_index, data_op_index); + } + + input_data_info[data_index] = {output_size_list[kDataIndex], output_addr_list[kDataIndex]}; + SetInputOutsideAddr(output_addr_list); + data_op_index++; + if (InitInputZeroCopyNodes(node) != SUCCESS) { + GELOGE(PARAM_INVALID, "Input zero copy nodes init failed!"); + return PARAM_INVALID; + } + return SUCCESS; +} + /// +/// @ingroup ge +/// @brief input zero copy node Initialize. +/// @param [in] NodePtr: Data Op. +/// @return Status +/// +Status DavinciModel::InitInputZeroCopyNodes(const NodePtr &node) { + auto out_data_anchor = node->GetOutDataAnchor(kDataIndex); + if (out_data_anchor == nullptr) { + GELOGE(FAILED, "Out data anchor is nullptr"); + return FAILED; + } + for (auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) { + auto node = peer_in_data_anchor->GetOwnerNode(); + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(FAILED, "Op desc is nullptr"); + return FAILED; + } + string batch_label; + (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label); + if (batch_label.empty()) { + batch_label = kDefaultBatchLable; + } + if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) { + zero_copy_op_id_batch_label_.emplace(pair(op_desc->GetId(), batch_label)); + GELOGD("Init input zero copy nodes success, op name:%s, op id: %ld, batch label: %s.", op_desc->GetName().c_str(), + op_desc->GetId(), batch_label.c_str()); + } + } + return SUCCESS; +} + +/// @ingroup ge +/// @brief NetOutput Op Initialize. +/// @param [in] op_desc: NetOutput Op descriptor. +/// @return Status +Status DavinciModel::InitNetOutput(const OpDescPtr &op_desc) { + // op_desc Checked by Init: NetOutput, valid. + uint32_t parent_index = 0; // Ignore subgraph NetOutput Node. + if (AttrUtils::GetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + GELOGI("Skip subgraph NetOutput node: %s.", op_desc->GetName().c_str()); + return SUCCESS; + } + + output_op_list_.push_back(op_desc); + std::vector output_size_list; // useless, just for check. 
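Editor's note: InitInputZeroCopyNodes above records, once per consumer op id, which dynamic-batch branch a zero-copy consumer belongs to, falling back to a shared default label when ATTR_NAME_BATCH_LABEL is absent; later zero-copy address patching can then be filtered per branch. The bookkeeping reduces to this sketch (the default label string is a stand-in for the patch's constant):

#include <cstdint>
#include <map>
#include <string>

// First writer wins: emplace is a no-op if op_id was already recorded.
void RecordZeroCopyConsumerSketch(std::map<int64_t, std::string> &op_id_to_label,
                                  int64_t op_id, std::string batch_label) {
  if (batch_label.empty()) {
    batch_label = "Batch_default";  // stand-in default batch label
  }
  op_id_to_label.emplace(op_id, batch_label);
}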
+ if (ModelUtils::GetOutputSize(op_desc, output_size_list, output_memory_size_list_) != SUCCESS) { + GELOGE(PARAM_INVALID, "Get output size failed: %s", op_desc->GetName().c_str()); + return PARAM_INVALID; + } + + // Make information for copy output data. + const vector input_size_list = ModelUtils::GetInputSize(op_desc); + const vector input_addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc); + if (input_size_list.empty() && input_addr_list.empty()) { + GELOGI("NetOutput[%s] is empty.", op_desc->GetName().c_str()); + return SUCCESS; + } + if (input_size_list.empty() || input_size_list.size() != input_addr_list.size() || + input_size_list.size() != output_size_list.size()) { + GELOGE(PARAM_INVALID, "NetOutput[%s] init failed: Input size is %zu, Input addr is %zu, Output size is %zu", + op_desc->GetName().c_str(), input_size_list.size(), input_addr_list.size(), output_size_list.size()); + return PARAM_INVALID; + } + + output_size_list_.insert(output_size_list_.end(), input_size_list.begin(), input_size_list.end()); + output_addr_list_.insert(output_addr_list_.end(), input_addr_list.begin(), input_addr_list.end()); + SetOutputOutsideAddr(input_addr_list); + return SUCCESS; +} + +/// @ingroup ge +/// @brief Make Input and Output addr for future use. +/// @param [in] input_data_info: Data index and addr info {index, {size, addr}}. +/// @return Status +Status DavinciModel::CombineDataInfo(const std::map> &input_data_info) { + input_size_list_.resize(data_op_list_.size()); + input_addr_list_.resize(data_op_list_.size()); + for (size_t index = 0; index < data_op_list_.size(); ++index) { + auto it = input_data_info.find(index); + if (it == input_data_info.end()) { + GELOGE(PARAM_INVALID, "Data init failed: index %zu, Data Op size is %zu, Input addr is %zu", index, + data_op_list_.size(), input_data_info.size()); + return INTERNAL_ERROR; + } + input_size_list_[index] = it->second.first; + input_addr_list_[index] = it->second.second; + } + + GELOGI("Data init success, input size %zu, output size %zu", input_size_list_.size(), output_size_list_.size()); + return SUCCESS; +} + /// @ingroup ge /// @brief ACL case, Load task list with queue. /// @param [in] input_queue_ids: input queue ids from user, nums equal Data Op. /// @param [in] output_queue_ids: input queue ids from user, nums equal NetOutput Op. -/// @return: 0 for success / others for fail -/// +/// @return: 0 for success / others for failed Status DavinciModel::SetQueIds(const std::vector &input_queue_ids, const std::vector &output_queue_ids) { if (input_queue_ids.empty() && output_queue_ids.empty()) { @@ -768,29 +1059,274 @@ Status DavinciModel::SetQueIds(const std::vector &input_queue_ids, } /// -/// @brief define static mode and mutex mode +/// @ingroup ge +/// @brief ACL case, Load task list with queue. +/// @param [in] input_que_ids: input queue ids from user, nums equal Data Op. +/// @param [in] output_que_ids: output queue ids from user, nums equal NetOutput Op.
+/// @return: 0 for success / others for failed /// -SysMode DavinciModel::mode_ = INFERENCE; -std::mutex DavinciModel::mutex_mode_; +Status DavinciModel::LoadWithQueue() { + if (input_queue_ids_.empty() && output_queue_ids_.empty()) { + return SUCCESS; + } + + if (input_queue_ids_.size() != data_op_list_.size()) { + GELOGE(PARAM_INVALID, "Input queue ids not match model: input_queue=%zu input_data=%zu", input_queue_ids_.size(), + data_op_list_.size()); + return PARAM_INVALID; + } + + if (output_queue_ids_.size() != output_size_list_.size()) { + GELOGE(PARAM_INVALID, "Output queue ids not match model: output_queue=%zu output_data=%zu", + output_queue_ids_.size(), output_size_list_.size()); + return PARAM_INVALID; + } + + // create stream instance which rt_model_handle_ is running on, this is S0. + GE_CHK_RT_RET(rtStreamCreateWithFlags(&rt_model_stream_, priority_, RT_STREAM_AICPU)); + is_inner_model_stream_ = true; + GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, rt_model_stream_, 0)); + + // Binding input_queue and Data Op. + GE_CHK_STATUS_RET(BindInputQueue(), "Launch bind input queue failed."); + + GE_CHK_STATUS_RET(BindActiveStream(), "Launch active entry stream failed."); + GE_CHK_STATUS_RET(CpuWaitEndGraph(), "Launch wait end graph failed."); + + // Binding output_queue and NetOutput Op. + GE_CHK_STATUS_RET(BindOutputQueue(), "Launch bind output queue failed."); + GE_CHK_STATUS_RET(CpuModelRepeat(), "Launch model repeat failed."); + + return SUCCESS; +} + +/// @ingroup ge +/// @brief queue schedule, Bind input queue to Data output address. +/// @return: 0 for success / others for failed +Status DavinciModel::BindInputQueue() { + // Caller checked: input_queue_ids_.size() == input_size_list_.size() == input_addr_list_.size() + for (size_t i = 0; i < input_queue_ids_.size(); ++i) { + uint32_t queue_id = input_queue_ids_[i]; + uint32_t data_size = input_size_list_[i]; + uintptr_t data_addr = reinterpret_cast(input_addr_list_[i]); + GELOGI("BindInputToQueue: graph_%u index[%zu] queue id[%u] output addr[0x%lx] output size[%u]", + runtime_param_.graph_id, i, queue_id, data_addr, data_size); + + if (rtModelBindQueue(rt_model_handle_, queue_id, RT_MODEL_INPUT_QUEUE) != RT_ERROR_NONE) { + return INTERNAL_ERROR; + } + + if (CpuModelDequeue(queue_id, data_addr, data_size) != SUCCESS) { + return INTERNAL_ERROR; + } + } + + return SUCCESS; +} + +/// @ingroup ge +/// @brief queue schedule, bind output queue to NetOutput input address. +/// @return: 0 for success / others for failed +Status DavinciModel::BindOutputQueue() { + // Caller checked: output_queue_ids_.size() == output_size_list_.size() == output_addr_list_.size() + for (size_t i = 0; i < output_queue_ids_.size(); ++i) { + uint32_t queue_id = output_queue_ids_[i]; + uint32_t data_size = output_size_list_[i]; + uintptr_t data_addr = reinterpret_cast(output_addr_list_[i]); + GELOGI("BindOutputToQueue: graph_%u index[%zu] queue id[%u] input addr[0x%lx] input size[%u]", + runtime_param_.graph_id, i, queue_id, data_addr, data_size); + + if (rtModelBindQueue(rt_model_handle_, queue_id, RT_MODEL_OUTPUT_QUEUE) != RT_ERROR_NONE) { + return INTERNAL_ERROR; + } + + if (CpuModelEnqueue(queue_id, data_addr, data_size) != SUCCESS) { + return INTERNAL_ERROR; + } + } + + return SUCCESS; +} + +/// @ingroup ge +/// @brief queue schedule, active streams will be scheduled by S0. +/// @return: 0 for success / others for failed +Status DavinciModel::BindActiveStream() { + // Streams not in active_stream_indication_ are active streams.
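Editor's note: LoadWithQueue above is order-sensitive. After validating that queue counts match the recorded Data and NetOutput tensors (the parallel vectors of ids, sizes and addresses must line up index by index), it binds input queues and their dequeue tasks, activates the model streams, posts the end-graph wait, and only then binds output queues and the repeat task. As a hedged sketch, the launch is a fail-fast pipeline over stand-in steps:

#include <functional>
#include <vector>

enum StatusSketch { kSuccess = 0, kInternalError = 1 };

// Run the launch stages in order; abort on the first failing stage.
StatusSketch LoadWithQueueSketch(const std::vector<std::function<StatusSketch()>> &stages) {
  for (const auto &stage : stages) {
    StatusSketch ret = stage();
    if (ret != kSuccess) {
      return ret;
    }
  }
  return kSuccess;
}

// Usage sketch: LoadWithQueueSketch({bind_input_queues, bind_active_streams,
//                                    wait_end_graph, bind_output_queues,
//                                    model_repeat});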
+ std::vector active_stream_list; + for (size_t i = 0; i < stream_list_.size(); ++i) { + if (active_stream_indication_.count(i) == 0) { + active_stream_list.push_back(stream_list_[i]); + active_stream_indication_.insert(i); // deactivate all model streams. + } + } + + // Add active streams to the active entry; they will be activated by S0. + if (CpuActiveStream(active_stream_list) != SUCCESS) { + return INTERNAL_ERROR; + } + + return SUCCESS; +} + +/// @ingroup ge +/// @brief definiteness queue schedule, bind input queue to task. +/// @param [in] queue_id: input queue id from user. +/// @param [in] addr: Data Op output tensor address. +/// @param [in] size: Data Op output tensor size. +/// @return: 0 for success / others for failed +Status DavinciModel::CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t size) { + GELOGI("Set CpuKernel model dequeue task enter."); + std::shared_ptr dequeue_task = MakeShared(rt_model_stream_); + if (dequeue_task == nullptr) { + GELOGE(FAILED, "Make CpuTaskModelDequeue task failed."); + return FAILED; + } + + // Get DataOp Output address and bind to queue. + uintptr_t in_mbuf = 0; + if (dequeue_task->Init(queue_id, in_mbuf) != SUCCESS) { + return FAILED; + } + + std::shared_ptr prepare_input = MakeShared(rt_model_stream_); + if (prepare_input == nullptr) { + GELOGE(FAILED, "Make CpuTaskPrepareInput task failed."); + return FAILED; + } + + if (prepare_input->Init(addr, size, in_mbuf) != SUCCESS) { + return FAILED; + } + + cpu_task_list_.push_back(dequeue_task); + cpu_task_list_.push_back(prepare_input); + input_mbuf_list_.push_back(in_mbuf); + GELOGI("Set CpuKernel model dequeue task success."); + return SUCCESS; +} + +/// @ingroup ge +/// @brief definiteness queue schedule, bind output queue to task. +/// @param [in] queue_id: output queue id from user. +/// @param [in] addr: NetOutput Op input tensor address. +/// @param [in] size: NetOutput Op input tensor size. +/// @return: 0 for success / others for failed +Status DavinciModel::CpuModelEnqueue(uint32_t queue_id, uintptr_t addr, uint32_t size) { + GELOGI("Set CpuKernel model enqueue task enter."); + if (input_mbuf_list_.empty()) { + GELOGE(FAILED, "Need input mbuf to fill output mbuf head info."); + return FAILED; + } + + std::shared_ptr prepare_output = MakeShared(rt_model_stream_); + if (prepare_output == nullptr) { + GELOGE(FAILED, "Make CpuTaskPrepareOutput task failed."); + return FAILED; + } + + uintptr_t out_mbuf = 0; + if (prepare_output->Init(addr, size, input_mbuf_list_[0], out_mbuf) != SUCCESS) { + return FAILED; + } + + std::shared_ptr model_enqueue = MakeShared(rt_model_stream_); + if (model_enqueue == nullptr) { + GELOGE(FAILED, "Make CpuTaskModelEnqueue task failed."); + return FAILED; + } + + if (model_enqueue->Init(queue_id, out_mbuf) != SUCCESS) { + return FAILED; + } + + cpu_task_list_.push_back(prepare_output); + cpu_task_list_.push_back(model_enqueue); + output_mbuf_list_.push_back(out_mbuf); + GELOGI("Set CpuKernel model enqueue task success."); + return SUCCESS; +} + +/// +/// @ingroup ge +/// @brief definiteness queue schedule, activate original model streams. +/// @param [in] streams: streams to be activated by S0.
+/// @return: 0 for success / others for failed +/// +Status DavinciModel::CpuActiveStream(const std::vector &stream_list) { + GELOGI("Set CpuKernel active stream task:%zu enter.", stream_list.size()); + for (auto s : stream_list) { + std::shared_ptr active_entry = MakeShared(rt_model_stream_); + if (active_entry == nullptr) { + GELOGE(FAILED, "Make CpuTaskActiveEntry task failed."); + return FAILED; + } + + if (active_entry->Init(s) != SUCCESS) { + return FAILED; + } + + cpu_task_list_.push_back(active_entry); + } + + GELOGI("Set CpuKernel active stream task success."); + return SUCCESS; +} + +/// @ingroup ge +/// @brief definiteness queue schedule, wait for end graph. +/// @return: 0 for success / others for failed +Status DavinciModel::CpuWaitEndGraph() { + GELOGI("Set CpuKernel wait end graph task enter."); + std::shared_ptr wait_endgraph = MakeShared(rt_model_stream_); + if (wait_endgraph == nullptr) { + GELOGE(FAILED, "Make CpuTaskWaitEndGraph task failed."); + return FAILED; + } + + if (wait_endgraph->Init(runtime_model_id_) != SUCCESS) { + return FAILED; + } + + cpu_task_list_.push_back(wait_endgraph); + GELOGI("Set CpuKernel wait end graph task success."); + return SUCCESS; +} + +/// @ingroup ge +/// @brief definiteness queue schedule, repeat run model. +/// @return: 0 for success / others for failed +Status DavinciModel::CpuModelRepeat() { + GELOGI("Set CpuKernel repeat task enter."); + std::shared_ptr model_repeat = MakeShared(rt_model_stream_); + if (model_repeat == nullptr) { + GELOGE(FAILED, "Make CpuTaskModelRepeat task failed."); + return FAILED; + } + + if (model_repeat->Init(runtime_model_id_) != SUCCESS) { + return FAILED; + } + + cpu_task_list_.push_back(model_repeat); + GELOGI("Set CpuKernel repeat task success."); + return SUCCESS; +} -/// /// @ingroup domi_ome /// @brief get sys mode /// @return SysMode required system mode /// @author -/// SysMode DavinciModel::GetSysMode() { std::unique_lock lock(mutex_mode_); return mode_; } -/// /// @ingroup domi_ome /// @brief set sys mode /// @param [in] mode to be set /// @return Status mode set result /// @author -/// Status DavinciModel::SetSysMode(SysMode mode) { GE_CHK_BOOL_RET_STATUS(mode < RESERVED, PARAM_INVALID, "DavinciModel::SetSysMode Para Error"); @@ -805,11 +1341,11 @@ Status DavinciModel::GetInputOutputDescInfo(vector &input_d GELOGI("data_op_list_ is empty or input_desc size is not 1."); } else { std::vector input_formats; - GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed"); + GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed."); } - std::vector output_formats; - GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "get output desc info failed"); + std::vector outputFormats; + GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, outputFormats), "get output desc info failed."); return SUCCESS; } @@ -817,25 +1353,24 @@ Status DavinciModel::GetInputOutputDescInfo(vector &input_d Status DavinciModel::GetInputOutputDescInfoForZeroCopy(vector &input_desc, vector &output_desc) { if ((data_op_list_.empty()) || (data_op_list_[0]->GetInputsSize()) != 1) { - GELOGE(FAILED, "OP List Pointer is null or input_desc size is not 1!"); + GELOGE(FAILED, "OP List Pointer is null or input_desc size is not 1."); return FAILED; } std::vector input_formats; - GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed"); - std::vector output_formats; - GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, 
output_formats), "get output desc info failed"); + GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed."); + std::vector outputFormats; + GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, outputFormats), "get output desc info failed."); - GE_CHK_BOOL_RET_STATUS(output_desc.size() == output_size_list_.size(), INTERNAL_ERROR, + GE_CHK_BOOL_RET_STATUS(output_desc.size() == output_memory_size_list_.size(), INTERNAL_ERROR, "output_desc size[%zu] not equal output_size_list_[%zu] size!", output_desc.size(), - output_size_list_.size()); + output_memory_size_list_.size()); - GE_CHECK_GE(output_memory_size_list_.size(), output_size_list_.size()); /// For function zero copy,the memory should be aligned by 512 bytes. /// And, because of the cce op limit, size should be lager than the real shape size. The memory should be padded by 32 /// bytes. /// *size equals to ((tensorDesc->dataSize + 2 * 32 - 1) / 32) * 32; - for (size_t i = 0; i < output_size_list_.size(); i++) { + for (size_t i = 0; i < output_memory_size_list_.size(); i++) { output_desc[i].size = output_memory_size_list_[i]; } @@ -845,7 +1380,7 @@ Status DavinciModel::GetInputOutputDescInfoForZeroCopy(vector &input_desc, vector &output_desc, std::vector &input_formats, - std::vector &output_formats) { + std::vector &outputFormats) { if ((data_op_list_.empty()) || (data_op_list_[0]->GetInputsSize()) != 1) { GELOGE(FAILED, "OP List Pointer is null or input_desc size is not 1!"); return FAILED; @@ -853,7 +1388,47 @@ Status DavinciModel::GetInputOutputDescInfo(vector &input_d GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed"); - GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "get ouput desc info failed"); + GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, outputFormats), "get ouput desc info failed"); + + return SUCCESS; +} + +/// +/// @ingroup domi_ome +/// @brief Get dynamic batch_info +/// @param [out] batch_info +/// @return execute result +/// +Status DavinciModel::GetDynamicBatchInfo(std::vector> &batch_info) { + for (auto &iter : op_list_) { + OpDescPtr op_desc = iter.second; + if (op_desc == nullptr) { + GELOGE(FAILED, "op_desc is null, index=%u.", iter.first); + return FAILED; + } + + if (op_desc->GetType() != STREAMSWITCHN) { + continue; + } + + batch_info.clear(); + uint32_t batch_num = 0; + if (!AttrUtils::GetInt(op_desc, ATTR_NAME_BATCH_NUM, batch_num)) { + GELOGE(FAILED, "Failed to get attr ATTR_NAME_BATCH_NUM, StreamSwitchN: %s.", op_desc->GetName().c_str()); + return FAILED; + } + std::vector batch_shape; + for (uint32_t i = 0; i < batch_num; i++) { + batch_shape.clear(); + const std::string attr_name = ATTR_NAME_PRED_VALUE + "_" + std::to_string(i); + if (!AttrUtils::GetListInt(op_desc, attr_name, batch_shape)) { + GELOGE(FAILED, "Failed to get attr ATTR_NAME_PRED_VALUE, StreamSwitchN: %s.", op_desc->GetName().c_str()); + return FAILED; + } + batch_info.emplace_back(batch_shape); + } + break; + } return SUCCESS; } @@ -861,7 +1436,7 @@ Status DavinciModel::GetInputOutputDescInfo(vector &input_d Status DavinciModel::GetInputOutputDescInfoForZeroCopy(vector &input_desc, vector &output_desc, std::vector &input_formats, - std::vector &output_formats) { + std::vector &outputFormats) { if ((data_op_list_.empty()) || (1 != data_op_list_[0]->GetInputsSize())) { GELOGE(FAILED, "OP List Pointer is null or input_desc size is not 1!"); return FAILED; @@ -869,18 +1444,17 @@ Status DavinciModel::GetInputOutputDescInfoForZeroCopy(vectordataSize + 2 
* 32 - 1) / 32) * 32; - for (size_t i = 0; i < output_size_list_.size(); i++) { + for (size_t i = 0; i < output_memory_size_list_.size(); i++) { output_desc[i].size = output_memory_size_list_[i]; } @@ -893,14 +1467,13 @@ Status DavinciModel::GetInputDescInfo(vector &input_desc, s uint32_t n, c, h, w; GE_CHECK_NOTNULL(data_op_list_[index]); GE_CHECK_NOTNULL(data_op_list_[index]->GetInputDescPtr(0)); - Format format = data_op_list_[index]->GetOutputDescPtr(0)->GetFormat(); + Format format = data_op_list_[index]->GetInputDescPtr(0)->GetFormat(); n = format == FORMAT_NHWC ? NHWC_DIM_N : NCHW_DIM_N; c = format == FORMAT_NHWC ? NHWC_DIM_C : NCHW_DIM_C; h = format == FORMAT_NHWC ? NHWC_DIM_H : NCHW_DIM_H; w = format == FORMAT_NHWC ? NHWC_DIM_W : NCHW_DIM_W; - if (data_op_list_[index]->GetInputDescPtr(0)->GetShape().GetDimNum() == - static_cast(domi::NORMAL_TENSOR_SIZE)) { + if (data_op_list_[index]->GetInputDescPtr(0)->GetShape().GetDimNum() == static_cast(NORMAL_TENSOR_SIZE)) { input.shape_info.num = data_op_list_[index]->GetInputDescPtr(0)->GetShape().GetDim(n); input.shape_info.height = data_op_list_[index]->GetInputDescPtr(0)->GetShape().GetDim(h); input.shape_info.width = data_op_list_[index]->GetInputDescPtr(0)->GetShape().GetDim(w); @@ -912,7 +1485,7 @@ Status DavinciModel::GetInputDescInfo(vector &input_desc, s input.data_type = data_op_list_[index]->GetInputDescPtr(0)->GetDataType(); input.name = data_op_list_[index]->GetName(); - uint32_t input_size = 0; + int64_t input_size = 0; GE_CHK_STATUS_RET(TensorUtils::GetSize(*data_op_list_[index]->GetInputDescPtr(0), input_size), "get input size failed."); input.size = input_size; @@ -965,7 +1538,7 @@ void DavinciModel::CreateOutput(uint32_t index, OpDescPtr &op_desc, InputOutputD } int64_t tensor_size = 0; - (void)TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size); + (void)TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size); // no need to check value output.size = static_cast(tensor_size); output.data_type = op_desc->GetInputDescPtr(index)->GetDataType(); } @@ -1022,7 +1595,7 @@ Status DavinciModel::CopyInputData(const InputData ¤t_data, bool device_da for (auto op_desc : data_op_list_) { ret = CopyInputDataToModel(current_data.blobs, data_op_index, device_data); - GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "Copy input data to model ret fail, index:%u, model id:%u", + GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "Copy input data to model ret failed, index:%u, model id:%u", current_data.index, current_data.model_id); data_op_index++; } @@ -1050,12 +1623,300 @@ Status DavinciModel::SyncVarData() { for (auto op_desc : variable_op_list_) { ret = VarManager::Instance(session_id_)->SyncVarData(runtime_param_.graph_id, op_desc->GetName(), op_desc, mem_base_); - GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret fail, model id:%u, op name:%s", model_id_, + GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret failed, model id:%u, op name:%s.", model_id_, op_desc->GetName().c_str()); } return ret; } +inline int64_t SumSize(const vector &size_list) { + int64_t sum_size = 0; + for (const int64_t &size : size_list) { + sum_size += size; + } + return sum_size; +} + +Status DavinciModel::SinkModelProfile(std::shared_ptr &model) { + GE_CHECK_NOTNULL(model); + // not support non-sink model + GE_CHK_BOOL_EXEC(model->model_task_def_ != nullptr, return SUCCESS); + + // profiling plugin must be registered + Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); + if (reporter == nullptr) { + 
GELOGI("Profiling report is nullptr!"); + return SUCCESS; + } + + GELOGI("Start collect model load profiling data."); + + Msprof::Engine::ReporterData reporter_data{}; + // report model data tag name + std::string tag_name; + tag_name.append("model_load_info_").append(std::to_string(model->Id())); + GE_CHK_BOOL_EXEC(memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN, tag_name.c_str(), tag_name.size()) == EOK, + return FAILED, "Sink model tag memcpy error."); + + // Model Header + string name = model->Name(); + int32_t name_len = name.size(); + reporter_data.deviceId = device_id_; + reporter_data.data = (unsigned char *)&name_len; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + reporter_data.data = (unsigned char *)name.c_str(); + reporter_data.dataLen = name.size(); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + uint32_t model_id = model->Id(); + reporter_data.data = (unsigned char *)&model_id; + reporter_data.dataLen = sizeof(uint32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + // Load Start/End Time + int64_t start_time = model->GetLoadBeginTime(); + reporter_data.data = (unsigned char *)&start_time; + reporter_data.dataLen = sizeof(int64_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + int64_t end_time = model->GetLoadEndTime(); + reporter_data.data = (unsigned char *)&end_time; + reporter_data.dataLen = sizeof(int64_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + auto task_list = model->GetTaskList(); + auto op_list = model->GetOpList(); + + int32_t task_num = task_list.size(); + std::multimap op_id_map; + std::set task_id_set; + for (int32_t i = 0; i < task_num; i++) { + auto task = task_list[i]; + auto fusion_op_info = task->GetFusionOpInfo(); + + // when type is RT_MODEL_TASK_KERNEL, ctx is not null + if (fusion_op_info != nullptr) { + uint32_t op_num = fusion_op_info->original_op_names.size(); + uint32_t task_id = task->GetTaskID(); + if (op_num > 0) { + GELOGI("task.id = %u, opNum = %u", task_id, op_num); + op_id_map.insert(std::make_pair(fusion_op_info->op_index, task_id)); + } + } + } + + struct memoryInfo { + int64_t input_size; + int64_t output_size; + int64_t weight_size; + int64_t workspace_size; + int64_t total_size; + + memoryInfo() : input_size(0), output_size(0), weight_size(0), workspace_size(0), total_size(0) {} + }; + + using CIT = std::multimap::const_iterator; + using Range = std::pair; + for (int32_t i = 0; i < task_num; i++) { + auto task = task_list[i]; + auto fusion_op_info = task->GetFusionOpInfo(); + if (fusion_op_info != nullptr && fusion_op_info->original_op_names.size() > 0) { + uint32_t task_id = task->GetTaskID(); + uint32_t op_num = fusion_op_info->original_op_names.size(); + uint32_t task_count = 0; + if (task_id_set.count(task_id) != 0) { + continue; + } + + uint32_t op_id = fusion_op_info->op_index; + Range range = op_id_map.equal_range(op_id); + for (CIT range_idx = range.first; range_idx != range.second; ++range_idx) { + task_count++; + uint32_t task_id = range_idx->second; + task_id_set.insert(task_id); + } + + // op name after fusion + string fusion_op_name = 
fusion_op_info->op_name; + int32_t fusion_op_name_len = fusion_op_name.size() == 0 ? 1 : fusion_op_name.size(); + reporter_data.data = (unsigned char *)&fusion_op_name_len; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + reporter_data.data = (unsigned char *)fusion_op_name.c_str(); + reporter_data.dataLen = fusion_op_name_len; + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + // original op name before fusion + reporter_data.data = (unsigned char *)&op_num; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + for (uint32_t k = 0; k < op_num; k++) { + std::string op_name = fusion_op_info->original_op_names[k]; + int32_t op_name_len = op_name.size() == 0 ? 1 : op_name.size(); + reporter_data.data = (unsigned char *)&op_name_len; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + reporter_data.data = (unsigned char *)op_name.c_str(); + reporter_data.dataLen = op_name_len; + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + } + + // stream id info + uint32_t streamId = fusion_op_info->stream_id; + reporter_data.data = (unsigned char *)&streamId; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + // memory info + struct memoryInfo memory_info; + uint32_t op_index = fusion_op_info->op_index; + auto iter = op_list.find(op_index); + GE_CHK_BOOL_EXEC(iter != op_list.end(), return FAILED, "index is out of range, index: %u", op_index); + auto op_desc = iter->second; + memory_info.input_size = SumSize(ModelUtils::GetInputSize(op_desc)); + memory_info.output_size = SumSize(ModelUtils::GetOutputSize(op_desc)); + memory_info.workspace_size = SumSize(ModelUtils::GetWorkspaceSize(op_desc)); + memory_info.weight_size = SumSize(ModelUtils::GetWeightSize(op_desc)); + memory_info.total_size = + memory_info.weight_size + memory_info.input_size + memory_info.output_size + memory_info.workspace_size; + reporter_data.data = (unsigned char *)&memory_info; + reporter_data.dataLen = sizeof(struct memoryInfo); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + // task info + reporter_data.data = (unsigned char *)&task_count; + reporter_data.dataLen = sizeof(uint32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + + Range task_range = op_id_map.equal_range(op_id); + for (CIT idx = task_range.first; idx != task_range.second; ++idx) { + uint32_t task_id = idx->second; + reporter_data.data = (unsigned char *)&task_id; + reporter_data.dataLen = sizeof(uint32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + model->Id()); + } + } + } + return SUCCESS; +} + +Status DavinciModel::SinkTimeProfile(const InputData ¤t_data) { + // not support non-sink model + GE_CHK_BOOL_EXEC(this->model_task_def_ != nullptr, return SUCCESS); + + // 
profiling plugin must be registered + Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); + if (reporter == nullptr) { + GELOGI("Profiling report is nullptr!"); + return SUCCESS; + } + + Msprof::Engine::ReporterData reporter_data{}; + // report model data tag name + std::string tag_name; + tag_name.append("model_time_info_") + .append(std::to_string(this->Id())) + .append("_") + .append(std::to_string(current_data.index)); + + GE_CHK_BOOL_EXEC(memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN, tag_name.c_str(), tag_name.size()) == EOK, + return FAILED, "Sink model tag memcpy error."); + // device id + reporter_data.deviceId = device_id_; + + // Model Header + string name = this->Name(); + int32_t name_len = name.size(); + reporter_data.data = (unsigned char *)&name_len; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + this->Id()); + + reporter_data.data = (unsigned char *)name.c_str(); + reporter_data.dataLen = name.size(); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.", + this->Id()); + + // request id + uint64_t request_id = current_data.request_id; + reporter_data.data = (unsigned char *)&request_id; + reporter_data.dataLen = sizeof(uint32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, + "Reporter data fail, model id:%u, data index:%u.", this->Id(), current_data.index); + + // thread id + int32_t thread_id = GetDataInputTid(); + reporter_data.data = (unsigned char *)&thread_id; + reporter_data.dataLen = sizeof(int32_t); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, + "Reporter data fail, model id:%u, data index:%u.", this->Id(), current_data.index); + + // time info + time_info_.modelId = this->Id(); + reporter_data.data = (unsigned char *)&time_info_; + reporter_data.dataLen = sizeof(struct timeInfo); + GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, + "Reporter data fail, model id:%u, data index:%u.", this->Id(), current_data.index); + + return SUCCESS; +} + +void DavinciModel::SetProfileTime(ModelProcStage stage, int64_t endTime) { + int64_t time = endTime; + + if (time == 0) { + mmTimespec timespec = mmGetTickCount(); + time = timespec.tv_sec * 1000 * 1000 * 1000 + timespec.tv_nsec; // 1000 ^ 3 converts second to nanosecond + } + + switch (stage) { + case MODEL_LOAD_START: + load_begin_time_ = time; + break; + case MODEL_LOAD_END: + load_end_time_ = time; + break; + case MODEL_PRE_PROC_START: + time_info_.processBeginTime = time; + break; + case MODEL_PRE_PROC_END: + time_info_.processEndTime = time; + break; + case MODEL_INFER_START: + time_info_.inferenceBeginTime = time; + break; + case MODEL_INFER_END: + time_info_.inferenceEndTime = time; + break; + case MODEL_AFTER_PROC_START: + time_info_.dumpBeginTime = time; + break; + case MODEL_AFTER_PROC_END: + time_info_.dumpEndTime = time; + break; + default: + break; + } + return; +} /// /// @ingroup domi_ome /// @brief copy input data to Model's firat OP. 
Address already malloced when Load @@ -1073,7 +1934,7 @@ Status DavinciModel::CopyInputDataToModel(const std::vector &data, u data_op_list_.size()); GE_CHK_BOOL_RET_STATUS(data_op_index < data_op_list_.size(), PARAM_INVALID, - "input data op index(%u) is invalid, exceeds input op size(%zu)", data_op_index, + "input data op index(%zu) is invalid, exceeds input op size(%zu)", data_op_index, data_op_list_.size()); /// input datatype conversion, converting FLOAT to FP16, 4D to 5D at the same time. @@ -1091,41 +1952,27 @@ Status DavinciModel::CopyInputDataToModel(const std::vector &data, u "Data Op has invalid input_desc_size(%zu) or output_desc_size(%zu)", op_def->GetInputsSize(), op_def->GetOutputsSize()); - uint32_t input_size = 0; - GE_CHK_STATUS(TensorUtils::GetSize(*op_def->GetInputDescPtr(0), input_size), "get input size failed."); - - GE_CHK_BOOL_RET_STATUS(input_size >= data[data_index].length, PARAM_INVALID, - "input data size(%u) does not match model required size(%u), ret fail.", - data[data_index].length, input_size); - // float to float16 bool need_trans_flag = ModelUtils::IsInputTensorNeedTrans(data_op_list_[data_op_index], 0); - uint32_t output_size = 0; + int64_t output_size = 0; GE_CHK_STATUS(TensorUtils::GetSize(*op_def->GetOutputDescPtr(0), output_size), "get output size failed."); + GE_CHK_BOOL_RET_STATUS(output_size >= data[data_index].length, PARAM_INVALID, + "input data size(%u) does not match model required size(%zu), ret failed.", + data[data_index].length, output_size); vector outputs = op_def->GetOutputOffset(); - GE_CHECK_VECTOR_NOT_EMPTY(outputs); - - bool need_memset = false; - (void)AttrUtils::GetBool(op_def, "_need_memset", need_memset); - if (need_memset) { - void *data_out_addr = mem_base_ + outputs[0]; - // data+allreduce output align 512 - uint32_t output_size_align = (output_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE; - GE_CHK_RT_RET(rtMemset(data_out_addr, output_size_align + 1, 0U, output_size_align)); - } if (device_data) { - return CopyPlainData(data, data_index, data_op_index, outputs, output_size, RT_MEMCPY_DEVICE_TO_DEVICE); + return CopyPlainData(data, data_index, data_op_index, outputs, RT_MEMCPY_DEVICE_TO_DEVICE); } else if (need_trans_flag) { - return CopyTransData(data, data_index, data_op_index, outputs, output_size); + return CopyTransData(data, data_index, data_op_index, outputs); } else { - return CopyPlainData(data, data_index, data_op_index, outputs, output_size, RT_MEMCPY_HOST_TO_DEVICE); + return CopyPlainData(data, data_index, data_op_index, outputs, RT_MEMCPY_HOST_TO_DEVICE); } } Status DavinciModel::CopyTransData(const std::vector &data, uint32_t data_index, uint32_t data_op_index, - const std::vector &outputs, uint32_t output_size) { + const std::vector &outputs) { GE_CHECK_VECTOR_NOT_EMPTY(outputs); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(outputs[0] == -1, return PARAM_INVALID, "output offset is -1"); GE_CHK_BOOL_EXEC(data_index < data.size(), return PARAM_INVALID, "index:%u >= size:%zu", data_index, data.size()); @@ -1147,7 +1994,7 @@ Status DavinciModel::CopyTransData(const std::vector &data, uint32_t auto ret = formats::TransDataType({src_data, static_cast(src_data_size), src_data_type, dst_data_type}, tmp_result); if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %zu, error code %d", + GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %zu, error code %u", TypeUtils::DataTypeToSerialString(src_data_type).c_str(), 
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), src_data_size, ret); @@ -1155,20 +2002,20 @@ Status DavinciModel::CopyTransData(const std::vector &data, uint32_t } void *mem_addr = mem_base_ + outputs[0]; - auto rt_ret = rtMemcpy(mem_addr, runtime_param_.mem_size - outputs[0], tmp_result.data.get(), tmp_result.length, + auto rt_ret = rtMemcpy(mem_addr, static_cast(runtime_param_.mem_size - outputs[0]), + reinterpret_cast(tmp_result.data.get()), static_cast(tmp_result.length), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", tmp_result.length); return RT_FAILED; } - GELOGI("[IMAS]CopyTransData memcpy graph_%u type[F] name[%s] output[%d] datasize[%zu]", runtime_param_.graph_id, - data_op_list_[data_op_index]->GetName().c_str(), 0, tmp_result.length); + GELOGI("[IMAS]CopyTransData memcpy graph_%u type[F] name[%s] output[%d] memaddr[%p] datasize[%zu]", + runtime_param_.graph_id, data_op_list_[data_op_index]->GetName().c_str(), 0, mem_addr, tmp_result.length); return SUCCESS; } Status DavinciModel::CopyPlainData(const std::vector &data, uint32_t data_index, uint32_t data_op_index, - const std::vector &outputs, uint32_t output_size, - rtMemcpyKind_t kind) { + const std::vector &outputs, rtMemcpyKind_t kind) { GE_CHK_BOOL_EXEC(data_index < data.size(), return PARAM_INVALID, "index:%u >= size:%zu", data_index, data.size()); bool flag = data[data_index].isDataSupportMemShare && support_mem_shared_flag_; // if data attr support zero cpy,then update addrs info to flowtable @@ -1207,13 +2054,13 @@ Status DavinciModel::CopyPlainData(const std::vector &data, uint32_t /// @return Status result /// @author /// -Status DavinciModel::CopyOutputData(uint32_t model_id, uint32_t data_id, OutputData &output_data) { +Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data) { Status ret = SUCCESS; if (output_op_list_.empty()) { ret = SyncVarData(); } else { output_data.index = data_id; - output_data.model_id = model_id; + output_data.model_id = model_id_; GE_CHK_BOOL_RET_STATUS(output_data.blobs.size() == output_size_list_.size(), INTERNAL_ERROR, "output buffer size[%zu] not equal output_size_list[%zu] size!", output_data.blobs.size(), output_size_list_.size()); @@ -1222,12 +2069,12 @@ Status DavinciModel::CopyOutputData(uint32_t model_id, uint32_t data_id, OutputD uint32_t output_data_index = 0; for (auto &op_desc : output_op_list_) { ret = CopyOutputDataToUser(op_desc, output_data.blobs, output_data_index); - GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "Copy input data to model ret fail, index:%u, model id:%u", + GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "Copy input data to model ret failed, index:%u, model id:%u", output_data.index, output_data.model_id); } } - (void)DumpOpInputOutput(op_list_, model_id); // dump, not care result. + (void)DumpOpInputOutput(); // dump, not care result. return ret; } @@ -1236,7 +2083,7 @@ Status DavinciModel::CopyOutputDataToUser(OpDescPtr &op_desc, std::vector v_output_size; + vector v_output_size; vector v_output_data_addr; model_output.GetOutputData(v_output_data_addr, v_output_size); @@ -1265,7 +2112,7 @@ Status DavinciModel::SyncDataAndDump() { ret = SyncVarData(); } - (void)DumpOpInputOutput(op_list_, model_id_); // dump, not care result. + (void)DumpOpInputOutput(); // dump, not care result. 
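Editor's note: CopyTransData above converts the user buffer on the host first (for example FLOAT to FP16 via the formats helpers) and only then copies the converted bytes to the device, where the destination window is whatever remains after the output offset. A bounds-check sketch of that final copy, with plain memcpy standing in for rtMemcpy:

#include <cstdint>
#include <cstring>

// Copy `length` converted bytes to base + offset without overrunning the
// model memory of `mem_size` bytes (names illustrative).
bool CopyToModelMemSketch(uint8_t *mem_base, uint64_t mem_size, int64_t offset,
                          const uint8_t *converted, size_t length) {
  if (offset < 0 || static_cast<uint64_t>(offset) + length > mem_size) {
    return false;  // destination window too small
  }
  std::memcpy(mem_base + offset, converted, length);
  return true;
}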
return ret; } @@ -1277,25 +2124,25 @@ Status DavinciModel::SyncDataAndDump() { /// @return Status result /// @author /// -Status DavinciModel::ReturnResult(uint32_t model_id, uint32_t data_id, const bool rslt_flg, const bool seq_end_flag, +Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const bool seq_end_flag, OutputData *output_data) { - GE_CHK_BOOL_EXEC(listener_ != nullptr, return PARAM_INVALID, "listener_ is null!"); + GE_CHK_BOOL_EXEC(listener_ != nullptr, return PARAM_INVALID, "listener_ is null."); if (seq_end_flag) { - GELOGW("End of sequence, model id: %u", model_id); - GE_CHK_STATUS(listener_->OnComputeDone(model_id, data_id, END_OF_SEQUENCE), "OnComputeDone failed"); + GELOGW("End of sequence, model id: %u", model_id_); + GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, END_OF_SEQUENCE), "OnComputeDone failed"); return END_OF_SEQUENCE; } // return result is not required if (!rslt_flg) { - GELOGW("Compute failed, model id: %u", model_id); - GE_CHK_STATUS(listener_->OnComputeDone(model_id, data_id, INTERNAL_ERROR), "OnComputeDone failed"); + GELOGW("Compute failed, model id: %u", model_id_); + GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR), "OnComputeDone failed."); return INTERNAL_ERROR; } if (output_op_list_.empty()) { - GELOGW("Output tensor list is empty, model id: %u", model_id); - GE_CHK_STATUS(listener_->OnComputeDone(model_id, data_id, INTERNAL_ERROR), "OnComputeDone failed"); + GELOGW("Output tensor list is empty, model id: %u", model_id_); + GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR), "OnComputeDone failed."); return INTERNAL_ERROR; } @@ -1304,22 +2151,21 @@ Status DavinciModel::ReturnResult(uint32_t model_id, uint32_t data_id, const boo uint32_t data_index = 0; output_data->index = data_id; - output_data->model_id = model_id; + output_data->model_id = model_id_; // copy output data from op to designated position for (auto &op_desc : output_op_list_) { Status ret = ModelOutput::CopyResult(this, op_desc, *output_data, data_index, support_mem_shared_flag_); if (ret != SUCCESS) { GELOGE(INTERNAL_ERROR, "CopyResult failed, op name: %s", op_desc->GetName().c_str()); - GE_CHK_STATUS(listener_->OnComputeDone(model_id, data_id, INTERNAL_ERROR), "OnComputeDone failed"); + GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR), "OnComputeDone failed"); return INTERNAL_ERROR; } } - GE_IF_BOOL_EXEC((DumpOpInputOutput(op_list_, model_id) != SUCCESS), - GELOGW("dump op failed, model_id: %u", model_id);); + GE_IF_BOOL_EXEC((DumpOpInputOutput() != SUCCESS), GELOGW("dump op failed, model_id: %u", model_id_);); - GE_CHK_STATUS(listener_->OnComputeDone(model_id, data_id, SUCCESS), "OnComputeDone failed"); + GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, SUCCESS), "OnComputeDone failed"); return SUCCESS; } @@ -1328,18 +2174,18 @@ Status DavinciModel::ReturnResult(uint32_t model_id, uint32_t data_id, const boo /// @brief return not output to upper layer for cloud case /// @return Status result /// -Status DavinciModel::ReturnNoOutput(uint32_t model_id, uint32_t data_id) { - GELOGI("ReturnNoOutput model id:%u", model_id); - for (const auto &op_desc : variable_op_list_) { +Status DavinciModel::ReturnNoOutput(uint32_t data_id) { + GELOGI("ReturnNoOutput model id:%u", model_id_); + for (auto op_desc : variable_op_list_) { Status ret = VarManager::Instance(session_id_) ->SyncBroadCastData2Var(runtime_param_.graph_id, op_desc->GetName(), op_desc, mem_base_); - 
GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret fail, model id:%u, op name:%s", model_id,
+    GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret failed, model id:%u, op name:%s.", model_id_,
                      op_desc->GetName().c_str());
   }

-  GE_IF_BOOL_EXEC(DumpOpInputOutput(op_list_, model_id) != SUCCESS, GELOGW("dump op failed, model_id: %u", model_id););
+  GE_IF_BOOL_EXEC((DumpOpInputOutput() != SUCCESS), GELOGW("dump op failed, model_id: %u", model_id_););

   GE_CHK_BOOL_EXEC(listener_ != nullptr, return PARAM_INVALID, "listener_ is null!");
-  GE_CHK_STATUS(listener_->OnComputeDone(model_id, data_id, SUCCESS), "OnComputeDone failed");
+  GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, SUCCESS), "OnComputeDone failed.");

   return SUCCESS;
 }
@@ -1349,34 +2195,31 @@ Status DavinciModel::ReturnNoOutput(uint32_t model_id, uint32_t data_id) {
 /// @param [in] op_list model_id
 /// @return Status result
 ///
-Status DavinciModel::DumpOpInputOutput(map<uint32_t, OpDescPtr> &op_list, uint32_t model_id) {
-  if (op_list.empty()) {
+Status DavinciModel::DumpOpInputOutput() {
+  if (op_list_.empty()) {
     GELOGW("op_list is empty.");
     return FAILED;
   }
-#ifdef FMK_SUPPORT_DUMP
   char *ge_dump_env = getenv("DUMP_OP");
-  int dump_op_switch = (ge_dump_env != nullptr) ? std::strtol(ge_dump_env, nullptr, kDecimal) : 0;
-  // 10 for decimal number
+  int dump_op_switch =
+      (ge_dump_env != nullptr) ? std::strtol(ge_dump_env, nullptr, kDecimal) : 0;  // 10 for decimal number
   if (dump_op_switch != 0) {
     int64_t cnt = 1;
-    for (auto it : op_list) {
+    for (auto it : op_list_) {
       if (maxDumpOpNum_ != 0 && cnt > maxDumpOpNum_) {
         GELOGW("dump op cnt > maxDumpOpNum, maxDumpOpNum: %ld.", maxDumpOpNum_);
         return SUCCESS;
       }
-      Status ret = DumpSingleOpInputOutput(it.second, model_id);
+      Status ret = DumpSingleOpInputOutput(it.second);
       cnt++;
       if (ret != SUCCESS) {
-        GELOGE(FAILED, "dump single op failed, model_id: %u", model_id);
+        GELOGE(FAILED, "dump single op failed, model_id: %u", model_id_);
         return FAILED;
       }
     }
+  } else {
+    GELOGW("need to set DUMP_OP for dump op input and output.");
   }
-#else
-  GELOGW("need to define FMK_SUPPORT_DUMP for dump op input and output.");
-#endif
-
   return SUCCESS;
 }
@@ -1386,24 +2229,31 @@ Status DavinciModel::DumpOpInputOutput(map<uint32_t, OpDescPtr> &op_list, uint32
 /// @param [in] dump_op model_id
 /// @return Status result
 ///
-Status DavinciModel::DumpSingleOpInputOutput(const OpDescPtr &op_def, uint32_t model_id) {
-  GE_CHK_BOOL_EXEC(op_def != nullptr, return PARAM_INVALID, "op_def is null!");
-  string op_name = StringUtils::ReplaceAll(op_def->GetName(), "/", "-");
-  GELOGI("dump op name:%s, type:%s, model_id: %u", op_def->GetName().c_str(), op_def->GetType().c_str(), model_id);
-  string model_path = "./dump" + to_string(model_id);
+Status DavinciModel::DumpSingleOpInputOutput(const OpDescPtr &op_def) {
+  GE_CHK_BOOL_EXEC(op_def != nullptr, return PARAM_INVALID, "op_def is null!");
+  string op_name = ge::StringUtils::ReplaceAll(op_def->GetName(), "/", "-");
+  GELOGI("dump op name:%s, type:%s, model_id: %u.", op_def->GetName().c_str(), op_def->GetType().c_str(), model_id_);
+  string model_path = "./dump" + to_string(model_id_);
   if (mmAccess(model_path.c_str()) != EN_OK) {
     int32_t ret = mmMkdir(model_path.c_str(), S_IRUSR | S_IWUSR | S_IXUSR);
     if (ret != EN_OK) {
-      GELOGE(FAILED, "make dir failed, model_id: %u", model_id);
+      GELOGE(FAILED, "make dir failed, model_id: %u", model_id_);
       return FAILED;
     }
   }
-  const vector<uint32_t> input_size_vec = ModelUtils::GetInputSize(op_def);
+  const vector<int64_t> input_size_vec = ModelUtils::GetInputSize(op_def);
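// A minimal sketch of the DUMP_OP switch parsing above, using only the C++
// standard library; the function name is hypothetical and base 10 stands in
// for the kDecimal constant referenced by the patch.
#include <cstdlib>
bool DumpOpSwitchOn() {
  const char *env = std::getenv("DUMP_OP");  // unset means op dumping stays off
  return (env != nullptr) && (std::strtol(env, nullptr, 10) != 0);
}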
const vector input_addr_vec = ModelUtils::GetInputDataAddrs(runtime_param_, op_def, false); + vector v_memory_type; + bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_def, ATTR_NAME_INPUT_MEM_TYPE_LIST, v_memory_type); + GELOGD("DumpSingleOp[%s], input size[%zu], input memory type size[%zu]", op_def->GetName().c_str(), + op_def->GetInputsSize(), v_memory_type.size()); for (size_t i = 0; i < input_addr_vec.size(); i++) { - uint32_t input_size = input_size_vec.at(i); + if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { + continue; + } + int64_t input_size = input_size_vec.at(i); char input_file_name[PATH_MAX] = {0}; - if ((sprintf_s(input_file_name, PATH_MAX, "%s/dump_%u_%s_%s_input_%zu.bin", model_path.c_str(), model_id, + if ((sprintf_s(input_file_name, PATH_MAX, "%s/dump_%u_%s_%s_input_%zu.bin", model_path.c_str(), model_id_, op_def->GetType().c_str(), op_name.c_str(), i)) == -1) { GELOGE(FAILED, "construct input dump file path failed."); return FAILED; @@ -1414,13 +2264,20 @@ Status DavinciModel::DumpSingleOpInputOutput(const OpDescPtr &op_def, uint32_t m } } - const vector output_size_vec = ModelUtils::GetOutputSize(op_def); + const vector output_size_vec = ModelUtils::GetOutputSize(op_def); const vector output_addr_vec = ModelUtils::GetOutputDataAddrs(runtime_param_, op_def, false); + v_memory_type.clear(); + has_mem_type_attr = ge::AttrUtils::GetListInt(op_def, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, v_memory_type); + GELOGD("DumpSingleOp[%s], output size[%zu], output memory type size[%zu]", op_def->GetName().c_str(), + op_def->GetOutputsSize(), v_memory_type.size()); if (!(op_def->GetType() == "Const")) { for (size_t i = 0; i < output_addr_vec.size(); i++) { - uint32_t output_size = output_size_vec.at(i); + if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { + continue; + } + int64_t output_size = output_size_vec.at(i); char output_file_name[PATH_MAX] = {0}; - if ((sprintf_s(output_file_name, PATH_MAX, "%s/dump_%u_%s_%s_output_%zu.bin", model_path.c_str(), model_id, + if ((sprintf_s(output_file_name, PATH_MAX, "%s/dump_%u_%s_%s_output_%zu.bin", model_path.c_str(), model_id_, op_def->GetType().c_str(), op_name.c_str(), i)) == -1) { GELOGE(FAILED, "construct output dump file path failed."); return FAILED; @@ -1442,7 +2299,7 @@ void *DavinciModel::Run(DavinciModel *model) { uint32_t model_id = model->Id(); uint32_t device_id = model->GetDeviceId(); - GELOGI("Model Run thread start, model_id:%u", model_id); + GELOGI("Model Run thread start, model_id:%u.", model_id); rtError_t rt_ret = rtSetDevice(static_cast(device_id)); if (rt_ret != RT_ERROR_NONE) { GELOGE(FAILED, "Model run rtsetdevice failed."); @@ -1470,44 +2327,47 @@ void *DavinciModel::Run(DavinciModel *model) { GE_IF_BOOL_EXEC(!model->RunFlag(), break); InputData current_data = data_wrapper->GetInput(); - GELOGI("Model thread Run begin, model id:%u, data index:%d.", model_id, current_data.index); + GELOGI("Model thread Run begin, model id:%u, data index:%u.", model_id, current_data.index); GE_TIMESTAMP_START(Model_SyncVarData); ret = model->SyncVarData(); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( - ret != SUCCESS, - (void)model->ReturnResult(model->model_id_, current_data.index, false, false, data_wrapper->GetOutput()); + ret != SUCCESS, (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); continue, "Copy input data to model failed."); // [No need to check value] 
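// A minimal sketch of what a stage recorder like SetProfileTime could do,
// assuming it stores wall-clock marks into the timeInfo struct declared later
// in this patch; the stage-to-field mapping below is illustrative only.
#include <chrono>
void RecordStage(timeInfo &info, ModelProcStage stage) {
  using std::chrono::duration_cast;
  using std::chrono::microseconds;
  using std::chrono::system_clock;
  int64_t now = duration_cast<microseconds>(system_clock::now().time_since_epoch()).count();
  switch (stage) {
    case MODEL_PRE_PROC_START: info.processBeginTime = now; break;    // host pre-processing begins
    case MODEL_INFER_START:    info.inferenceBeginTime = now; break;  // rtModelExecute about to launch
    case MODEL_INFER_END:      info.inferenceEndTime = now; break;    // stream synchronize returned
    case MODEL_AFTER_PROC_END: info.processEndTime = now; break;      // outputs copied back to the user
    default: break;
  }
}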
GE_TIMESTAMP_END(Model_SyncVarData, "Model Run SyncVarData"); GELOGI("Copy input data, model id:%u", model_id); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), model->SetProfileTime(MODEL_PRE_PROC_START)); ret = model->CopyInputData(current_data, false); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( - ret != SUCCESS, - (void)model->ReturnResult(model->model_id_, current_data.index, false, false, data_wrapper->GetOutput()); + ret != SUCCESS, (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); continue, "Copy input data to model failed."); // [No need to check value] - + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), model->SetProfileTime(MODEL_PRE_PROC_END)); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), model->SetProfileTime(MODEL_INFER_START)); if (ProfilingManager::Instance().ProfilingOpTraceOn()) { GELOGI("GetOpTraceIterNum:%d", ProfilingManager::Instance().GetOpTraceIterNum()); for (int32_t i = 0; i < ProfilingManager::Instance().GetOpTraceIterNum(); i++) { if (!ProfilingManager::Instance().ProfilingLoadFlag()) { - (void)ProfilingManager::Instance().StartProfiling(i); // just profiling, no need to check value + vector prof_device_id_vec = ProfilingManager::Instance().GetProfilingDeviceId(); + for (size_t j = 0; j < prof_device_id_vec.size(); ++j) { + // just profiling, no need to check value + (void)ProfilingManager::Instance().StartProfiling(i, prof_device_id_vec[j]); + } } - // collect profiling for ge - ProfilingManager::Instance().ReportProfilingData(model->GetTaskIdOpName()); + GELOGI("rtModelExecute start."); - rtError_t rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; (void)model->ReturnResult( - model->model_id_, current_data.index, false, false, data_wrapper->GetOutput()); + rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; + (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); continue); // [No need to check value] GELOGI("rtModelExecute end"); GELOGI("rtStreamSynchronize start."); rt_ret = rtStreamSynchronize(model->rt_model_stream_); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; (void)model->ReturnResult( - model->model_id_, current_data.index, false, seq_end_flag, data_wrapper->GetOutput()); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; + (void)model->ReturnResult(current_data.index, false, seq_end_flag, data_wrapper->GetOutput()); continue); // [No need to check value] GELOGI("rtStreamSynchronize end."); (void)ProfilingManager::Instance().StopProfiling(); // just profiling, no need to check value @@ -1515,11 +2375,11 @@ void *DavinciModel::Run(DavinciModel *model) { } else { GE_TIMESTAMP_START(rtModelExecute); GELOGI("rtModelExecute start."); - rtError_t rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0); - GE_IF_BOOL_EXEC( - rt_ret != RT_ERROR_NONE, rslt_flg = false; - (void)model->ReturnResult(model->model_id_, current_data.index, false, false, data_wrapper->GetOutput()); - CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); continue); + rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; + 
(void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); + CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); + continue); GELOGI("rtModelExecute end"); GE_TIMESTAMP_END(rtModelExecute, "GraphExcute::rtModelExecute"); @@ -1531,27 +2391,25 @@ void *DavinciModel::Run(DavinciModel *model) { } GE_IF_BOOL_EXEC( rt_ret != RT_ERROR_NONE, rslt_flg = false; GELOGI("seq_end_flg: %d", seq_end_flag); - (void)model->ReturnResult(model->model_id_, current_data.index, false, seq_end_flag, + (void)model->ReturnResult(current_data.index, false, seq_end_flag, data_wrapper->GetOutput()); // [No need to check value] CsaInteract::GetInstance().StoreInternalErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); continue); GELOGI("rtStreamSynchronize end."); GE_TIMESTAMP_END(rtStreamSynchronize, "GraphExcute::Wait for rtStreamSynchronize"); - - // collect profiling for ge - if (ProfilingManager::Instance().ProfilingOn()) { - ProfilingManager::Instance().ReportProfilingData(model->GetTaskIdOpName()); - } + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), model->SetProfileTime(MODEL_INFER_END)); } + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), model->SetProfileTime(MODEL_AFTER_PROC_START)); GE_TIMESTAMP_START(ReturnResult3); // copy output data from device to host - GE_IF_BOOL_EXEC( - !model->output_op_list_.empty(), - (void)model->ReturnResult(model->model_id_, current_data.index, rslt_flg, false, data_wrapper->GetOutput())) + GE_IF_BOOL_EXEC(!model->output_op_list_.empty(), + (void)model->ReturnResult(current_data.index, rslt_flg, false, data_wrapper->GetOutput())) // copy output data from device to host for variable graph - GE_IF_BOOL_EXEC(model->output_op_list_.empty(), (void)model->ReturnNoOutput(model->model_id_, current_data.index)); + GE_IF_BOOL_EXEC(model->output_op_list_.empty(), (void)model->ReturnNoOutput(current_data.index)); GE_TIMESTAMP_END(ReturnResult3, "GraphExcute::CopyDataFromDeviceToHost"); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), model->SetProfileTime(MODEL_AFTER_PROC_END)); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), (void)model->SinkTimeProfile(current_data)); model->iterator_count_++; GELOGI("run iterator count is %lu", model->iterator_count_); @@ -1559,7 +2417,7 @@ void *DavinciModel::Run(DavinciModel *model) { CsaInteract::GetInstance().WriteInternalErrorCode(); GELOGI("Model run end, model id:%u", model->model_id_); - GEEVENT("Model Run thread end, model_id:%u", model->model_id_); + GEEVENT("Model Run thread end, model_id:%u.", model->model_id_); return nullptr; } @@ -1571,7 +2429,7 @@ void *DavinciModel::Run(DavinciModel *model) { /// @author /// Status DavinciModel::DestroyThread() { - GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, INTERNAL_ERROR, "data_inputer_ is nullptr!"); + GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, INTERNAL_ERROR, "data_inputer_ is nullptr."); run_flg_ = false; @@ -1593,15 +2451,15 @@ Status DavinciModel::DestroyThread() { /// @author /// Status DavinciModel::ModelRunStart() { - GE_CHK_BOOL_RET_STATUS((DavinciModel::GetSysMode() != RESET) && (DavinciModel::GetSysMode() != STOP), INTERNAL_ERROR, + GE_CHK_BOOL_RET_STATUS((RESET != DavinciModel::GetSysMode()) && (STOP != DavinciModel::GetSysMode()), INTERNAL_ERROR, "Model Start FAIL in wrong sys mode!"); - GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, INTERNAL_ERROR, "data_inputer_ is nullptr!"); + GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, 
INTERNAL_ERROR, "data_inputer_ is nullptr."); LockRunFlg(); GE_MAKE_GUARD(tmp_lock, [&] { UnlockRunFlg(); }); - GE_CHK_BOOL_RET_STATUS(!run_flg_, INTERNAL_ERROR, "Model already started!"); + GE_CHK_BOOL_RET_STATUS(!run_flg_, INTERNAL_ERROR, "Model already started."); run_flg_ = true; @@ -1615,7 +2473,7 @@ Status DavinciModel::ModelRunStart() { maxDumpOpNum_ = maxDumpOpNum; CREATE_STD_THREAD(thread_id_, DavinciModel::Run, this); - GELOGI("model tread create success, model id:%u", model_id_); + GELOGI("model tread create success, model id:%u.", model_id_); return SUCCESS; } @@ -1635,7 +2493,7 @@ Status DavinciModel::ModelRunStop() { GE_IF_BOOL_EXEC(!run_flg_, return SUCCESS); - GE_CHK_STATUS_RET(DestroyThread(), "DestoyThead failed!"); + GE_CHK_STATUS_RET(DestroyThread(), "DestoyThead failed."); return SUCCESS; } @@ -1650,8 +2508,11 @@ void DavinciModel::UnbindTaskSinkStream() { } if (is_inner_model_stream_) { + if (!input_queue_ids_.empty() || !output_queue_ids_.empty()) { + GE_LOGW_IF(rtModelUnbindStream(rt_model_handle_, rt_model_stream_) != RT_ERROR_NONE, "Unbind stream failed!"); + } // destroy stream that is bound with rt_model - GE_LOGW_IF(rtStreamDestroy(rt_model_stream_) != RT_ERROR_NONE, "Destroy stream for rt_model failed!") + GE_LOGW_IF(rtStreamDestroy(rt_model_stream_) != RT_ERROR_NONE, "Destroy stream for rt_model failed.") } return; } @@ -1660,8 +2521,7 @@ Status DavinciModel::InitTaskInfo(domi::ModelTaskDef &model_task_def) { GELOGI("InitTaskInfo in,task size %zu", model_task_def.task().size()); task_list_.resize(model_task_def.task_size()); std::vector> futures(model_task_def.task_size()); - constexpr uint32_t thread_num = THREAD_NUM; - ThreadPool executor(thread_num); + ThreadPool executor(kThreadNum); rtContext_t ctx = nullptr; rtError_t rt_ret = rtCtxGetCurrent(&ctx); if (rt_ret != RT_ERROR_NONE || ctx == nullptr) { @@ -1670,6 +2530,9 @@ Status DavinciModel::InitTaskInfo(domi::ModelTaskDef &model_task_def) { } for (int32_t i = 0; i < model_task_def.task_size(); ++i) { + if (model_task_def.task(i).type() == static_cast(RT_MODEL_TASK_MODEL_END_GRAPH)) { + end_graph_id_ = i; + } std::future f = executor.commit( [](const domi::TaskDef &task, DavinciModel *model, rtContext_t ctx, int32_t idx) -> Status { rtError_t rt_ret = rtCtxSetCurrent(ctx); @@ -1698,7 +2561,7 @@ Status DavinciModel::InitTaskInfo(domi::ModelTaskDef &model_task_def) { for (size_t i = 0; i < futures.size(); ++i) { ret = futures[i].get(); if (ret != SUCCESS) { - GELOGE(ret, "Task index %zu init fail.", i); + GELOGE(ret, "Task index %zu init failed.", i); return ret; } } @@ -1709,21 +2572,25 @@ Status DavinciModel::InitTaskInfo(domi::ModelTaskDef &model_task_def) { Status DavinciModel::DistributeTask() { GELOGI("do Distribute."); - - op_task_id_map_.clear(); - Status ret; - for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) { - auto &task = task_list_.at(task_index); + for (auto &task : cpu_task_list_) { if (task == nullptr) { GELOGW("task is null"); continue; } - ret = task->Distribute(); - if (ret != SUCCESS) { - GELOGE(ret, "Distribute Fail!"); - return ret; - } + GE_CHK_STATUS_RET(task->Distribute()); + } + + task_desc_info_.clear(); + bool flag = GetL1FusionEnableOption(); + char *skt_enable_env = getenv("SKT_ENABLE"); + int64_t env_flag = (skt_enable_env != nullptr) ? 
strtol(skt_enable_env, nullptr, 10) : 0; + if (env_flag != 0) { + flag = true; + } + for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) { + auto &task = task_list_.at(task_index); + GE_CHK_STATUS_RET(task->Distribute(), "Task[%zu] distribute fail", task_index); // for data dump if (reinterpret_cast(task->GetDumpArgs()) != nullptr) { auto op_index = std::max(model_task_def_->task(task_index).kernel().context().op_index(), @@ -1735,7 +2602,7 @@ Status DavinciModel::DistributeTask() { } if (PropertiesManager::Instance().IsLayerNeedDump(name_, op->GetName())) { - data_dumper_.SaveDumpTask(task->GetTaskID(), op, task->GetDumpArgs()); + SaveDumpTask(task->GetTaskID(), op, task->GetDumpArgs()); } } @@ -1747,74 +2614,185 @@ Status DavinciModel::DistributeTask() { } // else task index is found in op_name_map_ + TaskDescInfo task_desc_info; string op_name = op_name_map_[task_index]; - op_task_id_map_[task->GetTaskID()] = op_name; + task_desc_info.op_name = op_name; + task_desc_info.block_dim = model_task_def_->task(task_index).kernel().block_dim(); + task_desc_info.task_id = task->GetTaskID(); + task_desc_info.stream_id = task->GetStreamId(); + task_desc_info_.emplace_back(task_desc_info); + if (flag) { + if (task->GetSktTaskID() != 0xFFFFFFFF) { + TaskDescInfo task_desc_info; + string op_name = "super_kernel_" + to_string(task_index); + task_desc_info.op_name = op_name; + task_desc_info.task_id = task->GetSktTaskID(); + task_desc_info_.emplace_back(task_desc_info); + } + } } } - + AddEndGraphToTaskList(); // launch dump kernel to aicpu - ret = data_dumper_.LoadDumpInfo(); - if (ret != SUCCESS) { - GELOGE(ret, "Load dump info fail."); - return ret; + GE_CHK_STATUS_RET(data_dumper_.LoadDumpInfo(), "Load dump info failed."); + return SUCCESS; +} + +void DavinciModel::AddEndGraphToTaskList() { + auto all_dump_model = PropertiesManager::Instance().GetAllDumpModel(); + if (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end() || + all_dump_model.find(name_) != all_dump_model.end()) { + if (end_graph_id_ != 0xFFFFFFFF && end_graph_op_ != nullptr) { + data_dumper_.SaveDumpTask(task_list_[end_graph_id_]->GetTaskID(), end_graph_op_, 0); + GELOGI("The type of op is %s and the task id is %u", end_graph_op_->GetType().c_str(), + task_list_[end_graph_id_]->GetTaskID()); + } else { + GELOGD("There are no end graph node in the graph"); + } } +} - return SUCCESS; +/// +/// @ingroup ge +/// @brief Save Data address info for ZeroCopy. +/// @param [in] const std::vector &outside_addrs +/// @return None. +/// +void DavinciModel::SetInputOutsideAddr(const std::vector &outside_addrs) { + for (auto addr : outside_addrs) { + if (input_outside_addrs_.find(addr) != input_outside_addrs_.end()) { + continue; + } + + (void)input_outside_addrs_.emplace(std::pair>(addr, {})); + GELOGI("SetInputOutsideAddr success."); + } } /// -/// @ingroup domi_ome -/// @brief Save Data and NetOutput address info for ZeroCopy. +/// @ingroup ge +/// @brief Save NetOutput address info for ZeroCopy. /// @param [in] const std::vector &outside_addrs /// @return None. 
/// -void DavinciModel::SetOutsideAddr(const std::vector &outside_addrs) { +void DavinciModel::SetOutputOutsideAddr(const std::vector &outside_addrs) { for (auto addr : outside_addrs) { - if (outside_addrs_.find(addr) != outside_addrs_.end()) { + if (output_outside_addrs_.find(addr) != output_outside_addrs_.end()) { continue; } - (void)outside_addrs_.emplace(std::pair>(addr, {})); - GELOGI("SetOutsideAddr success."); + (void)output_outside_addrs_.emplace(std::pair>(addr, {})); + GELOGI("SetOutputOutsideAddr success."); } } /// -/// @ingroup domi_ome +/// @ingroup ge /// @brief Save outside address used info for ZeroCopy. +/// @param [in] const OpDescPtr &op_desc: current op desc /// @param [in] const std::vector &outside_addrs: address of task /// @param [in] const char *args_offset: arguments address save the address. /// @return None. /// -void DavinciModel::SetZeroCopyAddr(const std::vector &outside_addrs, void *args_offset) { +void DavinciModel::SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector &outside_addrs, + void *args_offset) { + // Internal call has ensured that op_desc is not nullptr + int64_t op_id = op_desc->GetId(); size_t nums = outside_addrs.size(); for (size_t i = 0; i < nums; ++i) { std::lock_guard lock(outside_addrs_mutex_); - auto it = outside_addrs_.find(outside_addrs[i]); - if (it == outside_addrs_.end()) { + auto input_iter = input_outside_addrs_.find(outside_addrs[i]); + if (input_iter != input_outside_addrs_.end()) { + input_iter->second.push_back(static_cast(args_offset) + i * sizeof(void *)); + GELOGI("SetZeroCopyAddr of input outside_addrs."); + } + auto output_iter = output_outside_addrs_.find(outside_addrs[i]); + if (output_iter != output_outside_addrs_.end()) { + output_iter->second.push_back(static_cast(args_offset) + i * sizeof(void *)); + GELOGI("SetZeroCopyAddr of output outside_addrs."); + } + + // Establish a mapping between batch label and zero copy address for multi-batch scenes + if (zero_copy_op_id_batch_label_.find(op_id) == zero_copy_op_id_batch_label_.end()) { continue; } + std::string batch_label = zero_copy_op_id_batch_label_.find(op_id)->second; + auto iter = zero_copy_batch_label_addrs_.find(batch_label); + if (iter != zero_copy_batch_label_addrs_.end()) { + iter->second.push_back(static_cast(args_offset) + i * sizeof(void *)); + GELOGD("Set zero copy batch label and addrs success, batch label: %s", batch_label.c_str()); + } else { + std::vector addrs; + addrs.emplace_back(static_cast(args_offset) + i * sizeof(void *)); + zero_copy_batch_label_addrs_.emplace(pair>(batch_label, addrs)); + GELOGD("New added zero copy batch label and addrs success, batch label: %s", batch_label.c_str()); + } + } +} + +/// +/// @ingroup ge +/// @brief Copy Check input size and model op size. +/// @param [in] const int64_t &input_size: input size. +/// @param [in] const int64_t &op_size: model op size. +/// @param [in] is_dynamic_input: dynamic batch input flag. 
+/// @return true if success +/// +bool DavinciModel::CheckInputAndModelSize(const int64_t &input_size, const int64_t &op_size, bool is_dynamic_input) { + if (is_dynamic_input) { + GELOGI("No need to check input and model size."); + return true; + } - it->second.push_back(static_cast(args_offset) + i * sizeof(void *)); - GELOGI("SetZeroCopyAddr of outside_addrs."); + if (input_size > op_size) { + GELOGE(FAILED, "Input size [%u] can not be bigger than op size [%u]", input_size, op_size); + return false; + } + bool is_dynamic_aipp = false; + for (const auto &op_desc : data_op_list_) { + if (op_desc->GetType() == AIPP_DATA_TYPE) { + GELOGI("This is dynamic aipp model."); + is_dynamic_aipp = true; + break; + } + } + if (is_dynamic_aipp) { + GELOGI("This is dynamic aipp model, no need to judge smaller input size"); + return true; + } + // Judge overflow first + if (input_size > (INT64_MAX - kDataMemAlignSizeCompare)) { + GELOGI("The Input size [%ld] is smaller than model size [%ld] and is in the range of 64 bytes", input_size, + op_size); + return true; } + // The input and model input size can not be exactly equal because user input is not definite. + if ((input_size + kDataMemAlignSizeCompare) < op_size) { + GELOGE(FAILED, "Input size [%ld] can not be smaller than op size [%ld] after 64-byte alignment", input_size, + op_size); + return false; + } + return true; } /// -/// @ingroup domi_ome +/// @ingroup ge /// @brief Copy Inputs and Outputs addr to model for direct use. /// @param [in] const domi::InputData &input_data: model input data. /// @param [in] domi::OutputData &output_data: model output data. +/// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input /// @return SUCCESS handle successfully / PARAM_INVALID for failed /// -Status DavinciModel::ModelZeroCopy(const InputData &input_data, OutputData &output_data) { - if (ZeroCopyInput(input_data) != SUCCESS) { - GELOGE(PARAM_INVALID, "ZeroCopyInput failed."); +Status DavinciModel::CopyModelData(const InputData &input_data, OutputData &output_data, bool is_dynamic_input) { + if (ZeroCopyBlobs(input_addr_list_, input_size_list_, input_data.blobs, is_dynamic_input, kInputZeroCopy, + input_data.batch_label) != SUCCESS) { + GELOGE(PARAM_INVALID, "Copy input data to model failed."); return PARAM_INVALID; } - if (ZeroCopyOutput(output_data) != SUCCESS) { - GELOGE(PARAM_INVALID, "ZeroCopyOutput failed."); + if (ZeroCopyBlobs(output_addr_list_, output_size_list_, output_data.blobs, is_dynamic_input, kOutputZeroCopy, + input_data.batch_label) != SUCCESS) { + GELOGE(PARAM_INVALID, "Copy output data to model failed."); return PARAM_INVALID; } @@ -1824,108 +2802,103 @@ Status DavinciModel::ModelZeroCopy(const InputData &input_data, OutputData &outp } /// -/// @ingroup domi_ome +/// @ingroup ge /// @brief Copy Data addr to model for direct use. -/// @param [in] const domi::InputData &input_data: model input data info. +/// @param [in] const vector &addrs: model input memory addr list. +/// @param [in] const vector &sizes: model input memory size list. +/// @param [in] const std::vector &blobs: user input data list. 
+/// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input +/// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy +/// @param [in] string batch_label: batch label for multi-batch scenes /// @return SUCCESS handle successfully / others handle failed /// -Status DavinciModel::ZeroCopyInput(const InputData &input_data) { - GE_CHK_BOOL_RET_STATUS(!data_op_list_.empty(), SUCCESS, "data_op_list_ is empty!"); - GE_CHK_BOOL_RET_STATUS(data_op_list_.size() == input_data.blobs.size(), PARAM_INVALID, - "The input data list size (%zu) does not match the model input list size (%zu)", - input_data.blobs.size(), data_op_list_.size()); - - const std::vector &blobs = input_data.blobs; - for (size_t data_op_index = 0; data_op_index < data_op_list_.size(); ++data_op_index) { - auto op_desc = data_op_list_[data_op_index]; - GE_CHK_BOOL_EXEC(op_desc != nullptr, return PARAM_INVALID, "op_desc is null!"); - - auto data_index = static_cast(data_op_index); - if (AttrUtils::GetInt(op_desc, "index", data_index)) { - GELOGI("ge_train:get new index %u , old %zu", data_index, data_op_index); - } - GE_CHK_BOOL_EXEC(data_index < blobs.size(), return PARAM_INVALID, "index:%u >= size:%zu", data_index, blobs.size()); - GE_CHK_BOOL_RET_STATUS(op_desc->GetInputsSize() == 1 && op_desc->GetOutputsSize() == 1, PARAM_INVALID, - "Data Op has invalid input_desc_size(%zu) or output_desc_size(%zu)", - op_desc->GetInputsSize(), op_desc->GetOutputsSize()); - - uint32_t input_size = 0; - const DataBuffer &data_buf = blobs[data_index]; - GE_CHK_STATUS(TensorUtils::GetSize(*op_desc->GetInputDescPtr(0), input_size), "get input size failed."); - GE_CHK_BOOL_RET_STATUS(input_size >= data_buf.length, PARAM_INVALID, - "input data size(%u) does not match model required size(%u), ret fail.", data_buf.length, - input_size); - - const vector &outputs = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc); +Status DavinciModel::ZeroCopyBlobs(const std::vector &addr_list, const std::vector &size_list, + const std::vector &blobs, bool is_dynamic_input, + ZeroCopyMode zero_copy_mode, std::string batch_label) { + if ((blobs.size() != addr_list.size()) || (blobs.size() != size_list.size())) { + GELOGE(FAILED, "Blobs not match: blobs=%zu addr=%zu size=%zu", blobs.size(), addr_list.size(), size_list.size()); + return FAILED; + } + + for (size_t idx = 0; idx < size_list.size(); ++idx) { + const DataBuffer &data_buf = blobs[idx]; if (data_buf.data == nullptr) { - GELOGE(INTERNAL_ERROR, "data_buf.data is nullptr"); - return INTERNAL_ERROR; + GELOGE(FAILED, "data_buf.data is nullptr, index=%zu", idx); + return FAILED; } - if (ZeroCopyImpl(outputs[0], data_buf) != SUCCESS) { + GELOGI("Copy Blobs %zu: Input data length is %u, Op data size is %u.", idx, data_buf.length, size_list[idx]); + + if (!CheckInputAndModelSize(data_buf.length, size_list[idx], is_dynamic_input)) { + GELOGE(FAILED, "Check input size and model size failed"); return FAILED; } + + if (!is_dynamic_input) { + zero_copy_batch_label_addrs_.clear(); + } + + if (zero_copy_mode == kInputZeroCopy) { + if (ZeroCopyInputBlobs(addr_list[idx], size_list[idx], data_buf, zero_copy_mode, batch_label) != SUCCESS) { + GELOGE(FAILED, "Zero copy input blobs failed"); + return FAILED; + } + } + + if (zero_copy_mode == kOutputZeroCopy && !is_dynamic_input) { + if (ZeroCopyImpl(addr_list[idx], data_buf, zero_copy_mode, batch_label) != SUCCESS) { + GELOGE(FAILED, "Output zero copy data node copy failed"); + return FAILED; + } + 
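// A compact restatement of CheckInputAndModelSize above, with the dynamic
// input and dynamic AIPP early-accepts omitted: an oversized input is
// rejected, and a smaller one passes only if it comes within
// kDataMemAlignSizeCompare (64) bytes of the op size. For op_size = 1024 this
// accepts input_size = 1000 (1000 + 64 >= 1024) but rejects 900. The helper
// name is hypothetical.
#include <cstdint>
bool SizeWithinTolerance(int64_t input_size, int64_t op_size) {
  if (input_size > op_size) {
    return false;  // user buffer larger than the model buffer
  }
  if (input_size > INT64_MAX - 64) {
    return true;  // overflow guard, mirroring the patch
  }
  return (input_size + 64) >= op_size;  // allow up to 64 bytes of slack
}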
} } return SUCCESS; } /// -/// @ingroup domi_ome -/// @brief Copy NetOutput addr to model for direct use. -/// @param [in] const domi::OutputData &output_data: model output data info. +/// @ingroup ge +/// @brief Copy input addr to model for direct use. +/// @param [in] void *addr: model input memory addr. +/// @param [in] uint32_t size: model input memory size. +/// @param [in] const DataBuffer &data_buffer: user input data. +/// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input +/// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy +/// @param [in] string batch_label: batch label for multi-batch scenes /// @return SUCCESS handle successfully / others handle failed /// -Status DavinciModel::ZeroCopyOutput(const OutputData &output_data) { - GE_CHK_BOOL_RET_STATUS(output_data.blobs.size() == output_size_list_.size(), INTERNAL_ERROR, - "output buffer size[%zu] not equal output_size_list[%zu] size!", output_data.blobs.size(), - output_size_list_.size()); - - // index of data in output_data - uint32_t output_data_index = 0; - const std::vector &blobs = output_data.blobs; - for (auto &op_desc : output_op_list_) { - Output model_output(op_desc, this); - GE_CHK_BOOL_RET_STATUS(model_output.Init() == SUCCESS, PARAM_INVALID, "init model_output failed"); - vector v_output_size = ModelUtils::GetInputSize(op_desc); - vector v_output_data_addr = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc); - - // for all output tensor, copy output data from op to designated position - for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { - GE_CHK_BOOL_RET_STATUS(output_data_index < blobs.size(), PARAM_INVALID, - "The blobs size:%zu, data_op size:%zu, curr output size:%zu", blobs.size(), - data_op_list_.size(), op_desc->GetOutputsSize()); - const DataBuffer &data_buf = blobs[output_data_index]; - output_data_index++; - uint32_t size = data_buf.length; - GE_CHK_BOOL_RET_STATUS(size <= v_output_size[i], PARAM_INVALID, - "Model output data size(%u) does not match required size(%u).", v_output_size[i], - data_buf.length); - - GELOGI("ZeroCopyOutput memcpy graph_%u type[F] name[%s] output[%lu] memsize[%u] datasize[%u]", - runtime_param_.graph_id, op_desc->GetName().c_str(), i, data_buf.length, v_output_size[i]); - if (ZeroCopyImpl(v_output_data_addr[i], data_buf) != SUCCESS) { - return FAILED; - } +Status DavinciModel::ZeroCopyInputBlobs(void *addr, int64_t size, const DataBuffer &data_buffer, + ZeroCopyMode zero_copy_mode, string batch_label) { + auto iter = input_outside_addrs_.find(addr); + if (iter == input_outside_addrs_.end()) { + GELOGE(FAILED, "Can not find addr in input outside addrs"); + return FAILED; + } + const auto &used_zero_copy_list = iter->second; + if (used_zero_copy_list.empty()) { + if (rtMemcpy(addr, size, data_buffer.data, data_buffer.length, RT_MEMCPY_DEVICE_TO_DEVICE) != RT_ERROR_NONE) { + GELOGE(FAILED, "Non-zero copy data node copy failed"); + return FAILED; + } + } else { + if (ZeroCopyImpl(addr, data_buffer, zero_copy_mode, batch_label) != SUCCESS) { + GELOGE(FAILED, "Input zero copy data node copy failed"); + return FAILED; } } - return SUCCESS; } /// -/// @ingroup domi_ome +/// @ingroup ge /// @brief Copy address to args_ space for direct use. /// @param [in] const void *src_addr: source address of the Op. /// @param [in] const void *dst_addr: destination address of user data. 
+/// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy +/// @param [in] string batch_label: batch label for multi-batch scenes /// @return SUCCESS handle successfully / others handle failed /// -Status DavinciModel::ZeroCopyImpl(const void *src_addr, const DataBuffer &data_buf) { - auto it = outside_addrs_.find(src_addr); - if (it == outside_addrs_.end()) { - GELOGE(FAILED, "ZeroCopyImpl failed to find outside_addrs."); - return FAILED; - } - +Status DavinciModel::ZeroCopyImpl(const void *src_addr, const DataBuffer &data_buf, ZeroCopyMode zero_copy_mode, + std::string batch_label) { auto dst_addr = static_cast(data_buf.data); auto dst_size = static_cast(data_buf.length); Status ret = ModelUtils::ConvertVirtualAddressToPhysical(dst_addr, dst_size, dst_addr); @@ -1934,13 +2907,46 @@ Status DavinciModel::ZeroCopyImpl(const void *src_addr, const DataBuffer &data_b return FAILED; } - for (auto &addr : it->second) { + map>::iterator iter; + if (zero_copy_mode == kInputZeroCopy) { + iter = input_outside_addrs_.find(src_addr); + if (iter == input_outside_addrs_.end()) { + GELOGE(FAILED, "ZeroCopyImpl failed to find input outside_addrs."); + return FAILED; + } + } + + if (zero_copy_mode == kOutputZeroCopy) { + iter = output_outside_addrs_.find(src_addr); + if (iter == output_outside_addrs_.end()) { + GELOGE(FAILED, "ZeroCopyImpl failed to find output outside_addrs."); + return FAILED; + } + } + + // Used for dynamic batch/resolution scene + vector dynamic_input_addrs; + auto dynamic_input_iter = zero_copy_batch_label_addrs_.find(batch_label); + if (dynamic_input_iter != zero_copy_batch_label_addrs_.end()) { + dynamic_input_addrs = dynamic_input_iter->second; + } + vector fix_input_addrs; + auto fix_input_iter = zero_copy_batch_label_addrs_.find(kDefaultBatchLable); + if (fix_input_iter != zero_copy_batch_label_addrs_.end()) { + fix_input_addrs = fix_input_iter->second; + } + + for (auto &addr : iter->second) { + if (!CheckDynamicBatchZeroCopyAddr(addr, dynamic_input_addrs, fix_input_addrs)) { + continue; + } __builtin_prefetch(addr); rtError_t rt_err = rtMemcpy(addr, sizeof(void *), &dst_addr, sizeof(void *), RT_MEMCPY_HOST_TO_DEVICE); if (rt_err != RT_ERROR_NONE) { GELOGE(FAILED, "ZeroCopyImpl: rtMemcpy failed"); return FAILED; } + GELOGI("[IMAS]refresh in/out addr new:%p, old:%p", dst_addr, src_addr); } return SUCCESS; @@ -1956,7 +2962,7 @@ Status DavinciModel::ZeroCopyImpl(const void *src_addr, const DataBuffer &data_b void DavinciModel::GetUniqueId(const OpDescPtr &op_desc, std::string &unique_identification) { std::string session_graph_id; GE_IF_BOOL_EXEC(AttrUtils::GetStr(*op_desc, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id), - GELOGI("Get original type of session_graph_id.")); + GELOGD("Get original type of session_graph_id.")); if (session_graph_id.empty()) { return; } else if (session_graph_id.find("-1") != string::npos) { @@ -2003,7 +3009,7 @@ Status DavinciModel::InitConstant(const ConstOpDescPtr &op_desc) const { GeTensor *tensor = const_cast(v_weights[0].get()); GE_IF_BOOL_EXEC( - v_output_size[0] < tensor->GetData().size(), + static_cast(v_output_size[0]) < tensor->GetData().size(), GELOGE(PARAM_INVALID, "output size:%u less than weight data size:%zu", v_output_size[0], tensor->GetData().size()); return PARAM_INVALID;); @@ -2019,7 +3025,8 @@ Status DavinciModel::InitConstant(const ConstOpDescPtr &op_desc) const { /// unknown shape will not appear here, so we can use zero judge a tensor is scaler or not int64_t elem_num = tensor_shape.GetShapeSize() == 0 ? 
1 : tensor_shape.GetShapeSize(); uint64_t *buff = reinterpret_cast(tensor->MutableData().data()); - GE_CHK_BOOL_RET_STATUS(CheckInt64Uint32MulOverflow(elem_num, kBytes) == SUCCESS, FAILED, "Shape size is invalid"); + GE_CHK_BOOL_RET_STATUS(ge::CheckInt64Uint32MulOverflow(elem_num, kBytes) == SUCCESS, FAILED, + "Shape size is invalid"); int64_t offset = elem_num * kBytes; uint64_t hbm_raw_data_base_addr = reinterpret_cast(v_output_addr[0]) + offset; @@ -2038,7 +3045,6 @@ Status DavinciModel::InitConstant(const ConstOpDescPtr &op_desc) const { /// @brief TVM Op Init. /// @return Status /// - Status DavinciModel::InitTbeHandle(const OpDescPtr &op_desc) { TBEKernelPtr tbe_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr()); if (tbe_kernel == nullptr) { @@ -2143,9 +3149,9 @@ Status DavinciModel::MarkActiveStream(const OpDescPtr &op_desc) { GE_IF_BOOL_EXEC( type == STREAMSWITCH, std::vector active_stream_list; GE_LOGI_IF(!ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), - "GetInt ACTIVE_STREAM_LIST fail."); - if (active_stream_list.size() != TRUE_BRANCH_STREAM_NUM) { - GELOGE(INTERNAL_ERROR, "Stream num of switch true branch must be %u.", TRUE_BRANCH_STREAM_NUM); + "GetInt ACTIVE_STREAM_LIST failed."); + if (active_stream_list.size() != kTrueBranchStreamNum) { + GELOGE(INTERNAL_ERROR, "Stream num of switch true branch must be %u.", kTrueBranchStreamNum); return INTERNAL_ERROR; } uint32_t true_stream_id = active_stream_list.front(); active_stream_indication_.insert(true_stream_id); @@ -2154,7 +3160,7 @@ Status DavinciModel::MarkActiveStream(const OpDescPtr &op_desc) { type == STREAMACTIVE, if (op_desc->HasAttr(ATTR_NAME_SWITCH_BRANCH_NODE_LABEL)) { std::vector active_stream_list; GE_CHK_BOOL_EXEC(AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), - return INTERNAL_ERROR, "StreamActiveOp get attr ACTIVE_STREAM fail."); + return INTERNAL_ERROR, "StreamActiveOp get attr ACTIVE_STREAM failed."); for (size_t j = 0; j < active_stream_list.size(); ++j) { active_stream_indication_.insert(active_stream_list[j]); @@ -2162,13 +3168,28 @@ Status DavinciModel::MarkActiveStream(const OpDescPtr &op_desc) { active_stream_list[j]); } }); + + if (type == STREAMSWITCHN) { + std::vector active_stream_list; + if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list)) { + GELOGE(INTERNAL_ERROR, "StreamSwitchNOp get attr ACTIVE_STREAM failed."); + return INTERNAL_ERROR; + } + + for (size_t j = 0; j < active_stream_list.size(); ++j) { + active_stream_indication_.insert(active_stream_list[j]); + GELOGI("StreamSwitchNOp node:%s, active_stream_id=%u.", op_desc->GetName().c_str(), active_stream_list[j]); + }; + } + GELOGI("Flow control: active_stream_indication_ size = %zu.", active_stream_indication_.size()); + return SUCCESS; } bool DavinciModel::IsBroadCastOpData(const ge::NodePtr &var_node) { - for (const auto &out_anchor : var_node->GetAllOutDataAnchors()) { + for (auto out_anchor : var_node->GetAllOutDataAnchors()) { GE_RT_FALSE_CHECK_NOTNULL(out_anchor); - for (const auto &in_anchor : out_anchor->GetPeerInDataAnchors()) { + for (auto in_anchor : out_anchor->GetPeerInDataAnchors()) { GE_RT_FALSE_CHECK_NOTNULL(in_anchor); ge::NodePtr dst_node = in_anchor->GetOwnerNode(); GE_RT_FALSE_CHECK_NOTNULL(dst_node); @@ -2198,10 +3219,7 @@ Status DavinciModel::InitModelStream(rtStream_t stream, bool async_mode) { // synchronize mode, use forbidden stream. 
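// A minimal sketch of the overflow-checked index-size computation from
// InitConstant above: a rank-0 (scalar) string tensor still holds one
// element, and elem_num * kBytes is validated before the multiply is used.
// The helper name is hypothetical and kBytes is assumed to equal
// sizeof(uint64_t).
Status StringIndexBytes(const GeShape &tensor_shape, int64_t &bytes) {
  int64_t elem_num = (tensor_shape.GetShapeSize() == 0) ? 1 : tensor_shape.GetShapeSize();
  if (elem_num > static_cast<int64_t>(INT64_MAX / sizeof(uint64_t))) {
    return FAILED;  // elem_num * 8 would overflow int64_t
  }
  bytes = elem_num * static_cast<int64_t>(sizeof(uint64_t));
  return SUCCESS;
}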
if (stream != nullptr) { if ((rt_model_stream_ != nullptr) && is_inner_model_stream_) { - if (rtStreamDestroy(rt_model_stream_) != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Destroy rt_stream failed!"); - return FAILED; - } + GE_LOGW_IF(rtStreamDestroy(rt_model_stream_) != RT_ERROR_NONE, "Destroy rt_stream failed!"); } rt_model_stream_ = stream; @@ -2227,35 +3245,66 @@ Status DavinciModel::InitModelStream(rtStream_t stream, bool async_mode) { /// Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputData &input_data, OutputData &output_data) { - GELOGI("Model Run begin, model id:%u, data index:%d, flag:%d.", model_id_, input_data.index, async_mode); - GE_CHK_STATUS(InitModelStream(stream, async_mode), "Init model stream fail."); + GELOGI("Model Run begin, model id:%u, data index:%u, flag:%d.", model_id_, input_data.index, async_mode); + GE_CHK_STATUS(InitModelStream(stream, async_mode), "Init model stream failed."); GELOGI("do rtModelExecute task sink, model id:%u", input_data.model_id); - Status ret = ModelZeroCopy(input_data, output_data); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return INTERNAL_ERROR, "Copy input data to model failed."); - GELOGI("current_data.index=%u", input_data.index); + auto enable_dump = false; + auto dump_path = PropertiesManager::Instance().GetDumpOutputPath(); + if (!dump_path.empty()) { + enable_dump = true; + } - GELOGD("rtModelExecute do"); + auto dump_op_env = std::getenv("DUMP_OP"); + if (dump_op_env != nullptr) { + string dump_op_flag(dump_op_env); + if (dump_op_flag == "1") { + enable_dump = true; + } + } + GELOGI("dump path: %s, dump_op_env: %s", dump_path.c_str(), dump_op_env); + bool is_dynamic_batch = input_data.is_dynamic_batch; + if (is_dynamic_batch) { + input_use_zero_copy_ = true; + output_use_zero_copy_ = false; + } - rtError_t rt_ret = rtModelExecute(rt_model_handle_, rt_model_stream_, 0); - GE_CHK_RT_EXEC(rt_ret, return INTERNAL_ERROR); - GELOGI("rtModelExecute end"); + if (enable_dump) { + input_use_zero_copy_ = false; + output_use_zero_copy_ = false; + } - if (async_mode) { - rt_ret = rtStreamSynchronize(rt_model_stream_); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, return INTERNAL_ERROR); + // Asynchronous mode depends on zero copy. + if (async_mode && !input_use_zero_copy_ && !output_use_zero_copy_ && !task_list_.empty()) { + GELOGE(FAILED, "Asynchronous mode but zero copy disabled."); + return FAILED; } - ret = SyncDataAndDump(); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return INTERNAL_ERROR, "Copy Output data to user failed."); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_PRE_PROC_START)); + Status ret = + input_use_zero_copy_ ? 
CopyModelData(input_data, output_data, is_dynamic_batch) : CopyInputData(input_data, true); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return INTERNAL_ERROR, "Copy input data to model failed."); - // collect profiling for ge - if (ProfilingManager::Instance().ProfilingOn()) { - ProfilingManager::Instance().ReportProfilingData(op_task_id_map_); - GELOGI("Acl Profiling Op name taskId report."); + GELOGI("current_data.index=%u", input_data.index); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_PRE_PROC_END)); + + if (!task_list_.empty()) { + GELOGD("rtModelExecute do"); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_INFER_START)); + rtError_t rt_ret = rtModelExecute(rt_model_handle_, rt_model_stream_, 0); + GE_CHK_RT_EXEC(rt_ret, return INTERNAL_ERROR); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_INFER_END)); + GELOGI("rtModelExecute end"); } + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_START)); + ret = output_use_zero_copy_ ? SyncDataAndDump() : CopyOutputData(input_data.index, output_data); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return INTERNAL_ERROR, "Copy Output data to user failed."); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_END)); + + // report model time data + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), (void)SinkTimeProfile(input_data)); GELOGI("Model run end, model id:%u", model_id_); GEEVENT("Model Run thread end, model_id:%u", model_id_); return SUCCESS; @@ -2335,7 +3384,7 @@ uint32_t DavinciModel::GetGraphID(const std::string &session_graph_id) { Status DavinciModel::TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id) { GELOGI("TransAllVarData start: session_id:%lu, graph_id: %u.", session_id_, graph_id); - ThreadPool executor(THREAD_NUM); + ThreadPool executor(kThreadNum); std::vector> vector_future; rtContext_t ctx = nullptr; @@ -2381,7 +3430,7 @@ Status DavinciModel::TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id) GELOGI("The variable %s does not have any trans road", node->GetName().c_str()); return SUCCESS; } - ret = TransVarData(node, *trans_road, model->session_id_, model->device_id_); + ret = TransVarData(node, *trans_road, model->session_id_); if (ret != SUCCESS) { GELOGE(INTERNAL_ERROR, "TransVarData failed, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id); return INTERNAL_ERROR; @@ -2421,7 +3470,7 @@ void DavinciModel::SetDataDumperArgs() { int32_t device_id = 0; rtError_t rt_ret = rtGetDevice(&device_id); if (rt_ret != RT_ERROR_NONE || device_id < 0) { - GELOGE(RT_FAILED, "Call rtGetDevice fail, ret = 0x%X, device_id = %d.", rt_ret, device_id); + GELOGE(RT_FAILED, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); return; } data_dumper_.SetDeviceId(device_id); @@ -2533,7 +3582,7 @@ Status DavinciModel::CopyVarData(ComputeGraphPtr &compute_graph) { GE_IF_BOOL_EXEC(ge::AttrUtils::GetStr(node->GetOpDesc(), "_copy_from_var_node", cp_from_node), GELOGI("Get original type of cp_from_node")); if (cp_from_node.length() != 0) { - (void)ge::AttrUtils::GetBool(node->GetOpDesc(), "_copy_value", copy_value); + (void)ge::AttrUtils::GetBool(node->GetOpDesc(), "_copy_value", copy_value); // no need to check value if (!copy_value) { auto src_node = compute_graph->FindNode(cp_from_node); GE_CHECK_NOTNULL(src_node); @@ -2542,10 +3591,53 @@ Status DavinciModel::CopyVarData(ComputeGraphPtr 
&compute_graph) { auto ret = CopyTensorFromSrcVarNode(src_node, node); GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "copy tensor failed!"); return FAILED); // only copy once - (void)ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); + (void)ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); // no need to check value + } + } + } + return SUCCESS; +} + +Status DavinciModel::GetComputeGraphInfo(std::vector &compute_graph_desc_info) { + GELOGI("GetComputeGraphInfo start."); + if (compute_graph_ == nullptr) { + GELOGE(FAILED, "compute_graph_ is nullptr"); + return FAILED; + } + + for (auto &node : compute_graph_->GetAllNodes()) { + ComputeGraphDescInfo compute_graph_info; + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(PARAM_INVALID, "op_desc is nullptr."); + return PARAM_INVALID; + } + + auto op_mode = static_cast(domi::ImplyType::INVALID); + if (AttrUtils::GetInt(op_desc, ATTR_NAME_IMPLY_TYPE, op_mode) && + op_mode == static_cast(domi::ImplyType::TVM)) { + compute_graph_info.op_name = op_desc->GetName(); + compute_graph_info.op_type = op_desc->GetType(); + + for (size_t i = 0; i < op_desc->GetInputsSize(); ++i) { + GeTensorDesc input_desc = op_desc->GetInputDesc(i); + compute_graph_info.input_format.emplace_back(input_desc.GetFormat()); + compute_graph_info.input_shape.emplace_back(input_desc.GetShape().GetDims()); + compute_graph_info.input_data_type.emplace_back(input_desc.GetDataType()); } + + for (size_t j = 0; j < op_desc->GetOutputsSize(); ++j) { + GeTensorDesc output_desc = op_desc->GetOutputDesc(j); + compute_graph_info.output_format.emplace_back(output_desc.GetFormat()); + compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims()); + compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType()); + } + + compute_graph_desc_info.emplace_back(compute_graph_info); } } + GELOGI("GetComputeGraphInfo end."); return SUCCESS; } + } // namespace ge diff --git a/src/ge/graph/load/new_model_manager/davinci_model.h b/src/ge/graph/load/new_model_manager/davinci_model.h index 4a674517..9ce02a42 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.h +++ b/src/ge/graph/load/new_model_manager/davinci_model.h @@ -25,30 +25,55 @@ #include #include "common/ge_types.h" -#include "common/types.h" -#include "graph/load/new_model_manager/data_inputer.h" -#include "graph/load/new_model_manager/model_utils.h" -#include "proto/task.pb.h" -#include "mmpa/mmpa_api.h" +#include "common/helper/model_helper.h" +#include "common/helper/om_file_helper.h" #include "graph/debug/ge_attr_define.h" #include "common/opskernel/ge_task_info.h" +#include "common/types.h" #include "framework/common/util.h" +#include "graph/load/new_model_manager/data_dumper.h" +#include "graph/load/new_model_manager/data_inputer.h" +#include "graph/load/new_model_manager/model_utils.h" #include "graph/model.h" +#include "graph/node.h" #include "graph/op_desc.h" #include "graph/operator.h" -#include "graph/utils/tensor_utils.h" -#include "common/helper/model_helper.h" -#include "common/helper/om_file_helper.h" -#include "graph/load/new_model_manager/data_dumper.h" -#include "graph/node.h" #include "graph/utils/attr_utils.h" +#include "graph/utils/tensor_utils.h" +#include "mmpa/mmpa_api.h" +#include "proto/task.pb.h" #include "task_info/task_info.h" #define WEIGHTS_ADDR_TO_CCE(var) namespace ge { using std::vector; -const uint32_t MEM_ALIGN_SIZE = 512; +enum ZeroCopyMode { + kInputZeroCopy, + kOutputZeroCopy, +}; + +typedef enum tagModelProcStage { + 
MODEL_LOAD_START = 1, + MODEL_LOAD_END, + MODEL_PRE_PROC_START, + MODEL_PRE_PROC_END, + MODEL_INFER_START, + MODEL_INFER_END, + MODEL_AFTER_PROC_START, + MODEL_AFTER_PROC_END, + MODEL_PROC_INVALID, +} ModelProcStage; + +struct timeInfo { + uint32_t modelId; + int64_t processBeginTime; + int64_t processEndTime; + int64_t inferenceBeginTime; + int64_t inferenceEndTime; + int64_t dumpBeginTime; + int64_t dumpEndTime; +}; // comments class DavinciModel { @@ -157,6 +182,9 @@ class DavinciModel { // get Event number uint32_t EventNum() const { return runtime_param_.event_num; } + // get Lable number + uint32_t LabelNum() const { return runtime_param_.label_num; } + // get batch number uint32_t BatchNum() const { return runtime_param_.batch_num; } @@ -204,6 +232,9 @@ class DavinciModel { Status DestroyThread(); + // Get Data Op. + const vector &GetDataList() const { return data_op_list_; } + // get Op map GetOpList() const { return op_list_; } @@ -222,8 +253,8 @@ class DavinciModel { } return nullptr; } - // get taskid to op name - const map &GetTaskIdOpName() const { return op_task_id_map_; } + // get task info for profiling + const std::vector &GetTaskDescInfo() const { return task_desc_info_; } // get updated task info list std::vector GetTaskList() { return task_list_; } @@ -271,6 +302,14 @@ class DavinciModel { Status GetInputOutputDescInfo(vector &input_desc, vector &output_desc, std::vector &inputFormats, std::vector &output_formats); + /// + /// @ingroup domi_ome + /// @brief Get dynamic batch_info + /// @param [out] batch_info + /// @return execute result + /// + Status GetDynamicBatchInfo(std::vector> &batch_info); + /// /// @ingroup domi_ome /// @brief Get model_id. @@ -308,10 +347,9 @@ class DavinciModel { /// Status CopyInputDataToModel(const std::vector &data, uint32_t data_op_index, bool device_data); - Status ReturnResult(uint32_t model_id, uint32_t data_id, const bool rslt_flg, const bool seq_end_flg, - OutputData *output_data); + Status ReturnResult(uint32_t data_id, const bool rslt_flg, const bool seq_end_flg, OutputData *output_data); - Status ReturnNoOutput(uint32_t model_id, uint32_t data_id); + Status ReturnNoOutput(uint32_t data_id); /// /// @ingroup domi_ome @@ -319,7 +357,7 @@ class DavinciModel { /// @param [in] op_list model_id /// @return Status /// - Status DumpOpInputOutput(map &op_list, uint32_t model_id); + Status DumpOpInputOutput(); /// /// @ingroup domi_ome @@ -327,7 +365,7 @@ class DavinciModel { /// @param [in] dump_op model_id /// @return Status /// - Status DumpSingleOpInputOutput(const OpDescPtr &dump_op, uint32_t model_id); + Status DumpSingleOpInputOutput(const OpDescPtr &dump_op); Status ModelRunStart(); @@ -399,11 +437,28 @@ class DavinciModel { /// /// @ingroup domi_ome /// @brief Save outside address of Data or NetOutput used info for ZeroCopy. + /// @param [in] const OpDescPtr &op_desc: current op desc /// @param [in] const std::vector &outside_addrs: address of task /// @param [in] const void *args_offset: arguments address save the address. /// @return None. 
/// - void SetZeroCopyAddr(const std::vector &outside_addrs_, void *args_offset); + void SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector &outside_addrs_, void *args_offset); + + bool GetL1FusionEnableOption() { return is_l1_fusion_enable_; } + + void SetProfileTime(ModelProcStage stage, int64_t endTime = 0); + + int64_t GetLoadBeginTime() { return load_begin_time_; } + + int64_t GetLoadEndTime() { return load_end_time_; } + + Status SinkModelProfile(std::shared_ptr &model); + + Status SinkTimeProfile(const InputData ¤t_data); + + void SaveDumpTask(uint32_t task_id, const std::shared_ptr &op_desc, uintptr_t args) { + data_dumper_.SaveDumpTask(task_id, op_desc, args); + } DavinciModel &operator=(const DavinciModel &model) = delete; @@ -420,29 +475,97 @@ class DavinciModel { // input data manager DataInputer *data_inputer_; + int64_t load_begin_time_; + int64_t load_end_time_; + struct timeInfo time_info_; int32_t dataInputTid; /// /// @ingroup domi_ome - /// @brief Save Data and NetOutput address info for ZeroCopy. + /// @brief Save Data address info for ZeroCopy. /// @param [in] const std::vector &outside_addrs /// @return None. /// - void SetOutsideAddr(const std::vector &outside_addrs); - Status ModelZeroCopy(const InputData &input_data, OutputData &output_data); - Status ZeroCopyInput(const InputData &input_data); - Status ZeroCopyOutput(const OutputData &output_data); - Status ZeroCopyImpl(const void *src_addr, const DataBuffer &data_buf); + void SetInputOutsideAddr(const std::vector &outside_addrs); + + /// + /// @ingroup domi_ome + /// @brief Save NetOutput address info for ZeroCopy. + /// @param [in] const std::vector &outside_addrs + /// @return None. + /// + void SetOutputOutsideAddr(const std::vector &outside_addrs); + + /// + /// @ingroup ge + /// @brief Copy Check input size and model op size. + /// @param [in] const int64_t &input_size: input size. + /// @param [in] const int64_t &op_size: model op size. + /// @param [in] is_dynamic_input: dynamic batch input flag. + /// @return true if success + /// + bool CheckInputAndModelSize(const int64_t &input_size, const int64_t &op_size, bool is_dynamic_input); + + /// + /// @ingroup ge + /// @brief Copy Input/Output to model for direct use. + /// @param [in] const InputData &input_data: user input data info. + /// @param [in/out] OutputData &output_data: user output data info. + /// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input + /// @return SUCCESS handle successfully / others handle failed + /// + Status CopyModelData(const InputData &input_data, OutputData &output_data, bool is_dynamic_input); + + /// + /// @ingroup ge + /// @brief Copy Data addr to model for direct use. + /// @param [in] const vector &addrs: model input memory addr list. + /// @param [in] const vector &sizes: model input memory size list. + /// @param [in] const std::vector &blobs: user input data list. 
+ /// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input + /// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy + /// @param [in] string batch_label: batch label for multi-batch scenes + /// @return SUCCESS handle successfully / others handle failed + /// + Status ZeroCopyBlobs(const std::vector &addr_list, const std::vector &size_list, + const std::vector &blobs, bool is_dynamic_input, ZeroCopyMode zero_copy_mode, + string batch_label); + + /// + /// @ingroup ge + /// @brief Copy input addr to model for direct use. + /// @param [in] void *addr: model input memory addr. + /// @param [in] uint32_t size: model input memory size. + /// @param [in] const DataBuffer &data_buffer: user input data. + /// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input + /// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy + /// @param [in] string batch_label: batch label for multi-batch scenes + /// @return SUCCESS handle successfully / others handle failed + /// + Status ZeroCopyInputBlobs(void *addr, int64_t size, const DataBuffer &data_buffer, ZeroCopyMode zero_copy_mode, + string batch_label); + + /// + /// @ingroup ge + /// @brief Copy address to args_ space for direct use. + /// @param [in] const void *src_addr: source address of the Op. + /// @param [in] const void *dst_addr: destination address of user data. + /// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy + /// @param [in] string batch_label: batch label for multi-batch scenes + /// @return SUCCESS handle successfully / others handle failed + /// + Status ZeroCopyImpl(const void *src_addr, const DataBuffer &data_buf, ZeroCopyMode zero_copy_mode, + string batch_label); Status CopyInputData(const InputData ¤t_data, bool device_data = false); Status CopyTransData(const std::vector &data, uint32_t data_index, uint32_t data_op_index, - const std::vector &outputs, uint32_t output_size); + const std::vector &outputs); Status CopyPlainData(const std::vector &data, uint32_t data_index, uint32_t data_op_index, - const std::vector &outputs, uint32_t output_size, rtMemcpyKind_t kind); + const std::vector &outputs, rtMemcpyKind_t kind); - Status CopyOutputData(uint32_t model_id, uint32_t data_id, OutputData &output_data); + Status CopyOutputData(uint32_t data_id, OutputData &output_data); Status CopyOutputDataToUser(OpDescPtr &op_desc, std::vector &blobs, uint32_t &data_index); @@ -472,6 +595,51 @@ class DavinciModel { void UnbindTaskSinkStream(); + void AddEndGraphToTaskList(); + + /// + /// @ingroup ge + /// @brief Travel all nodes and do some init. + /// @param [in] compute_graph: ComputeGraph to load. + /// @return Status + /// + Status InitNodes(const ComputeGraphPtr &compute_graph); + + /// + /// @ingroup ge + /// @brief Data Op Initialize. + /// @param [in] NodePtr: Data Op. + /// @param [in/out] data_op_index: NetOutput addr size info. + /// @param [in/out] input_data_info: Data index and addr info {index, {size, addr}}. + /// @return Status + /// + Status InitDataOp(const NodePtr &node, uint32_t &data_op_index, + std::map> &input_data_info); + + /// + /// @ingroup ge + /// @brief input zero copy node Initialize. + /// @param [in] NodePtr: Data Op. + /// @return Status + /// + Status InitInputZeroCopyNodes(const NodePtr &node); + + /// + /// @ingroup ge + /// @brief NetOutput Op Initialize. + /// @param [in] op_desc: NetOutput Op descriptor. 
+ /// @return Status + /// + Status InitNetOutput(const OpDescPtr &op_desc); + + /// + /// @ingroup ge + /// @brief Make Input and Output addr for feature use. + /// @param [in] input_data_info: Data index and addr info {index, {size, addr}}. + /// @return Status + /// + Status CombineDataInfo(const std::map> &input_data_info); + /// /// @ingroup domi_ome /// @brief Constant Op Init. @@ -496,6 +664,34 @@ class DavinciModel { /// Status InitModelStream(rtStream_t stream, bool async_mode); + /// + /// @ingroup ge + /// @brief ACL, Load task list with queue entrance. + /// @return: 0 for success / others for fail + /// + Status LoadWithQueue(); + + /// + /// @ingroup ge + /// @brief ACL, Bind Data Op addr to input queue. + /// @return: 0 for success / others for fail + /// + Status BindInputQueue(); + + /// + /// @ingroup ge + /// @brief ACL, Bind NetOutput Op addr to output queue. + /// @return: 0 for success / others for fail + /// + Status BindOutputQueue(); + + /// + /// @ingroup ge + /// @brief ACL, Make active stream for S0. + /// @return: 0 for success / others for fail + /// + Status BindActiveStream(); + /// /// @ingroup domi_ome /// @brief insert active_stream_indication_ @@ -503,8 +699,57 @@ class DavinciModel { /// Status MarkActiveStream(const OpDescPtr &op_desc); + /// + /// @ingroup ge + /// @brief definiteness queue schedule, bind input queue to task. + /// @param [in] queue_id: input queue id from user. + /// @param [in] addr: Data Op output tensor address. + /// @param [in] size: Data Op output tensor size. + /// @return: 0 for success / others for fail + /// + Status CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t size); + + /// + /// @ingroup ge + /// @brief definiteness queue schedule, bind output queue to task. + /// @param [in] queue_id: output queue id from user. + /// @param [in] addr: NetOutput Op input tensor address. + /// @param [in] size: NetOutput Op input tensor size. + /// @return: 0 for success / others for fail + /// + Status CpuModelEnqueue(uint32_t queue_id, uintptr_t addr, uint32_t size); + + /// + /// @ingroup ge + /// @brief definiteness queue schedule, active original model stream. + /// @param [in] streams: streams will active by S0. + /// @return: 0 for success / others for fail + /// + Status CpuActiveStream(const std::vector &stream_list); + + /// + /// @ingroup ge + /// @brief definiteness queue schedule, wait for end graph. + /// @return: 0 for success / others for fail + /// + Status CpuWaitEndGraph(); + + /// + /// @ingroup ge + /// @brief definiteness queue schedule, repeat run model. + /// @return: 0 for success / others for fail + /// + Status CpuModelRepeat(); + void InitRuntimeParams(); + /// + /// @ingroup ge + /// @brief set ts device. 
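+ /// @note TS here refers to the device-side task scheduler; the assumption in
+ /// this sketch of the flow is that it must be selected before DoTaskSink()
+ /// sinks the task list.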
+ /// @return: 0 for success / others for fail + /// + Status SetTSDevice(); + void CheckHasHcomOp(); Status DoTaskSink(); @@ -517,10 +762,14 @@ Status CopyVarData(ComputeGraphPtr &graph); Status CopyTensorFromSrcVarNode(const NodePtr &var_src, const NodePtr &var_dst); + // get desc info of graph for profiling + Status GetComputeGraphInfo(vector &compute_graph_desc_info); + void SetDataDumperArgs(); bool is_model_has_inited_; uint32_t model_id_; + uint32_t runtime_model_id_; string name_; uint32_t version_; GeModelPtr ge_model_; @@ -534,10 +783,13 @@ class DavinciModel { vector variable_op_list_; - vector output_size_list_; + vector output_size_list_; // Init by NetOutput Input Tensor + vector output_addr_list_; // Init by NetOutput Input Tensor + vector input_size_list_; // Init by Data Output Tensor + vector input_addr_list_; // Init by Data Output Tensor // output op: save cce op actual needed memory size - vector output_memory_size_list_; + vector output_memory_size_list_; std::thread thread_id_; @@ -563,7 +815,12 @@ class DavinciModel { vector label_list_; std::mutex outside_addrs_mutex_; - std::map> outside_addrs_; + std::map> input_outside_addrs_; + std::map> output_outside_addrs_; + // {op_id, batch_label} + map zero_copy_op_id_batch_label_; + // {batch_label, addrs} + map> zero_copy_batch_label_addrs_; std::vector task_list_; // rt_model_handle @@ -574,8 +831,11 @@ class DavinciModel { bool is_inner_model_stream_; // ACL queue schedule, save queue ids for Init. - std::vector input_queue_ids_; - std::vector output_queue_ids_; + std::vector cpu_task_list_; + std::vector input_queue_ids_; // input queue ids created by caller. + std::vector output_queue_ids_; // output queue ids created by caller. + std::vector input_mbuf_list_; // input mbuf created by dequeue task. + std::vector output_mbuf_list_; // output mbuf created by dequeue task. // save input/output tensor descriptor in maps std::map data_op_input_tensor_desc_map_; @@ -597,22 +857,27 @@ class DavinciModel { std::set aicpu_streams_; std::set hcom_streams_; RuntimeParam runtime_param_; - TBEKernelStore tbekernel_store_; static std::mutex tvm_bin_mutex_; // lock for tvm maps. static std::set tvm_bin_kernel_; std::map used_tbe_handle_map_; - // for profiling + // for profiling task and graph info std::map op_name_map_; - std::map op_task_id_map_; + std::vector task_desc_info_; + ComputeGraphPtr compute_graph_; int64_t maxDumpOpNum_; // for data dump DataDumper data_dumper_; - + bool input_use_zero_copy_; + bool output_use_zero_copy_; uint64_t iterator_count_; + bool is_l1_fusion_enable_; + + uint32_t end_graph_id_; + OpDescPtr end_graph_op_; }; #define TIME_LOG_HEAD_FMT " OP_ID OP_NAME OP_TYPE ELAPSED TIME(ms)" diff --git a/src/ge/graph/load/new_model_manager/davinci_model_parser.cc b/src/ge/graph/load/new_model_manager/davinci_model_parser.cc index b0fbf8e4..b744f907 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model_parser.cc +++ b/src/ge/graph/load/new_model_manager/davinci_model_parser.cc @@ -82,7 +82,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelInfoParser(const Mo model_info.name = davinci_model->Name(); } catch (...) 
{ - GELOGE(FAILED, "OM model parser failed, some exceptions occur !"); + DOMI_LOGE("OM model parser failed, some exceptions occur !"); GE_CHK_RT(rtDeviceReset(0)); return FAILED; } diff --git a/src/ge/graph/load/new_model_manager/model_manager.cc b/src/ge/graph/load/new_model_manager/model_manager.cc index 3137d17b..a1fefff2 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.cc +++ b/src/ge/graph/load/new_model_manager/model_manager.cc @@ -169,8 +169,8 @@ ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_ } ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id) { - std::vector v_aicpu_kernel; std::lock_guard lock(sess_ids_mutex_); + std::vector v_aicpu_kernel; std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id); if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) { v_aicpu_kernel = model_aicpu_kernel_.at(model_key); @@ -212,7 +212,7 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, shared_ptr & GE_CHK_STATUS_RET(SetDevice(static_cast(GetContext().DeviceId())), "Set device failed, model id:%u.", model_id); - + mmTimespec timespec = mmGetTickCount(); std::shared_ptr davinci_model = MakeShared(0, listener); if (davinci_model == nullptr) { GELOGE(FAILED, "davinci_model is nullptr"); @@ -239,6 +239,15 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, shared_ptr & InsertModel(model_id, davinci_model); GELOGI("Parse model %u success.", model_id); + + if (ProfilingManager::Instance().ProfilingOn()) { + davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * 1000 * 1000 * 1000 + + timespec.tv_nsec)); // 1000 ^ 3 converts second to nanosecond + davinci_model->SetProfileTime(MODEL_LOAD_END); + if (davinci_model->SinkModelProfile(davinci_model) != SUCCESS) { + GELOGW("Sink model profile failed."); + } + } } while (0); GE_CHK_RT(rtDeviceReset(static_cast(GetContext().DeviceId()))); @@ -350,19 +359,16 @@ Status ModelManager::DataInputTensor(uint32_t model_id, const std::vectorGetOpList()) { - auto op = item.second; + for (const auto &op : model->GetDataList()) { GE_CHECK_NOTNULL(op); - if (op->GetType() == DATA) { - GE_CHECK_GE(inputs.size(), 1); - GE_CHECK_GE(inputs.size() - 1, index); - - DataBuffer data; - data.data = inputs[index].data.data; - data.length = inputs[index].data.length; - input_data.blobs.push_back(data); - index++; - } + GE_CHECK_GE(inputs.size(), 1); + GE_CHECK_GE(inputs.size() - 1, index); + + DataBuffer data; + data.data = inputs[index].data.data; + data.length = inputs[index].data.length; + input_data.blobs.push_back(data); + index++; } CHECK_FALSE_EXEC(input_data.blobs.size() >= inputs.size(), @@ -610,6 +616,21 @@ Status ModelManager::GetInputOutputDescInfo(const uint32_t model_id, vectorGetInputOutputDescInfo(input_desc, output_desc, inputFormats, outputFormats); } +/// +/// @ingroup ge +/// @brief Get dynamic batch_info +/// @param [in] model_id +/// @param [out] batch_info +/// @return execute result +/// +Status ModelManager::GetDynamicBatchInfo(const uint32_t model_id, std::vector> &batch_info) { + std::shared_ptr davinci_model = GetModel(model_id); + GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "GetDynamicBatchInfo Failed, Invalid Model ID %u !", + model_id); + + return davinci_model->GetDynamicBatchInfo(batch_info); +} + Status ModelManager::GetInputOutputDescInfoForZeroCopy(const uint32_t model_id, vector &input_desc, vector &output_desc, std::vector &inputFormats, @@ -624,10 +645,11 
@@ Status ModelManager::GetInputOutputDescInfoForZeroCopy(const uint32_t model_id, Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model, shared_ptr listener, void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) { GE_CHK_BOOL_RET_STATUS(model.key.empty() || access(model.key.c_str(), F_OK) == 0, PARAM_INVALID, - "input key file path is not valid!"); + "input key file path is not valid, %s", strerror(errno)); GenModelId(&model_id); shared_ptr davinci_model = nullptr; + mmTimespec timespec = mmGetTickCount(); ModelHelper model_helper; Status ret = model_helper.LoadModel(model); @@ -661,6 +683,15 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model GELOGI("Parse model %u success.", model_id); + if (ProfilingManager::Instance().ProfilingOn()) { + davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * 1000 * 1000 * 1000 + + timespec.tv_nsec)); // 1000^3 converts seconds to nanoseconds + davinci_model->SetProfileTime(MODEL_LOAD_END); + if (davinci_model->SinkModelProfile(davinci_model) != SUCCESS) { + GELOGW("Sink model profile failed."); + } + } + GE_IF_BOOL_EXEC(ret == SUCCESS, device_count++); return SUCCESS; } while (0); @@ -681,7 +712,7 @@ Status ModelManager::LoadModelWithQ(uint32_t &model_id, const ModelData &model_d const std::vector &input_queue_ids, const std::vector &output_queue_ids) { GE_CHK_BOOL_RET_STATUS(model_data.key.empty() || access(model_data.key.c_str(), F_OK) == 0, PARAM_INVALID, - "input key file path is not valid!"); + "input key file path is not valid, %s", strerror(errno)); ModelHelper model_helper; Status ret = model_helper.LoadModel(model_data); @@ -778,6 +809,11 @@ Status ModelManager::GetModelMemAndWeightSize(const ModelData &model, size_t &me ret = om_file_helper.Init(model_data, model_len); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "om file helper Init failed!"); + auto partition_table = reinterpret_cast(model_data); + if (partition_table->num == 1) { + GELOGE(FAILED, "om model is invalid, please use an executable om model"); + return FAILED; + } ModelPartition task_partition; if (om_file_helper.GetModelPartition(ModelPartitionType::TASK_INFO, task_partition) != SUCCESS) { GELOGE(FAILED, "get task model partition failed."); diff --git a/src/ge/graph/load/new_model_manager/model_manager.h b/src/ge/graph/load/new_model_manager/model_manager.h index 3aca1605..fe511c24 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.h +++ b/src/ge/graph/load/new_model_manager/model_manager.h @@ -23,8 +23,8 @@ #include #include #include -#include #include +#include #include "cce/aicpu_engine_struct.h" #include "common/types.h" #include "common/ge_types.h" @@ -37,6 +37,7 @@ #include "ge/ge_api_types.h" namespace ge { + class DavinciModel; class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { @@ -175,6 +176,14 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { ge::Status GetInputOutputDescInfo(const uint32_t model_id, std::vector &input_desc, std::vector &output_desc, std::vector &inputFormats, std::vector &outputFormats); + /// + /// @ingroup ge + /// @brief Get dynamic batch_info + /// @param [in] model_id + /// @param [out] batch_info + /// @return execute result + /// + ge::Status GetDynamicBatchInfo(const uint32_t model_id, std::vector> &batch_info); /// /// @ingroup domi_ome diff --git a/src/ge/graph/load/new_model_manager/model_output.cc b/src/ge/graph/load/new_model_manager/model_output.cc index 24f520b3..affda08a 100644 --- 
a/src/ge/graph/load/new_model_manager/model_output.cc +++ b/src/ge/graph/load/new_model_manager/model_output.cc @@ -15,13 +15,12 @@ */ #include "graph/load/new_model_manager/model_output.h" - #include #include - #include "common/debug/log.h" #include "common/op/ge_op_utils.h" #include "graph/load/new_model_manager/davinci_model.h" + #include "graph/load/output/output.h" namespace ge { diff --git a/src/ge/graph/load/new_model_manager/model_utils.cc b/src/ge/graph/load/new_model_manager/model_utils.cc index c47e669c..360a537f 100644 --- a/src/ge/graph/load/new_model_manager/model_utils.cc +++ b/src/ge/graph/load/new_model_manager/model_utils.cc @@ -58,15 +58,18 @@ bool ModelUtils::IsOutput(ConstOpDescPtr op_desc) { /// bool ModelUtils::IsInputTensorNeedTrans(ConstOpDescPtr op_desc, size_t tensor_index) { GE_CHECK_NOTNULL_EXEC(op_desc, return false); - const auto &input_desc = op_desc->GetInputDesc(tensor_index); - const auto &output_desc = op_desc->GetOutputDesc(tensor_index); + const auto &input_desc = op_desc->MutableInputDesc(static_cast(tensor_index)); + const auto &output_desc = op_desc->MutableOutputDesc(static_cast(tensor_index)); + GE_CHECK_NOTNULL_EXEC(input_desc, return false); + GE_CHECK_NOTNULL_EXEC(output_desc, return false); - if ((output_desc.GetFormat() == FORMAT_NC1HWC0) && (output_desc.GetDataType() == DT_INT8)) { + if ((output_desc->GetFormat() == FORMAT_NC1HWC0) && (output_desc->GetDataType() == DT_INT8)) { // AIPP input, add attribute in data op to tag aipp return false; } - return (input_desc.GetFormat() != output_desc.GetFormat()) || (input_desc.GetDataType() != output_desc.GetDataType()); + return (input_desc->GetFormat() != output_desc->GetFormat()) || + (input_desc->GetDataType() != output_desc->GetDataType()); } /// @@ -74,8 +77,8 @@ bool ModelUtils::IsInputTensorNeedTrans(ConstOpDescPtr op_desc, size_t tensor_in /// @brief Get input size. 
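/// @note Sizes are widened to int64_t in this change: a single fp32 tensor of
/// shape (64, 3, 4096, 4096) already needs 64*3*4096*4096*4 = 12884901888
/// bytes, which overflows the previous uint32_t. Minimal use of the widened
/// API, mirroring the hunks below:
/// @code
///   int64_t tensor_size = 0;
///   GE_CHK_STATUS(TensorUtils::GetSize(op_desc->GetInputDesc(0), tensor_size));
/// @endcode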
/// @return vector /// -vector ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { - vector v_input_size; +vector ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { + vector v_input_size; GE_CHECK_NOTNULL_EXEC(op_desc, return v_input_size); const size_t inputs_size = op_desc->GetInputsSize(); const string op_type = op_desc->GetType(); @@ -84,17 +87,18 @@ vector ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { for (size_t i = 0; i < inputs_size; ++i) { if ((i < v_is_input_const.size()) && v_is_input_const[i] && (op_type != NETOUTPUT)) { // TBE: add weights size to input - GE_IF_BOOL_EXEC(true, GeTensorDesc tensor_desc = op_desc->GetInputDesc(i); uint32_t tensor_size = 0; - GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); - if (tensor_size) { v_input_size.push_back(tensor_size); }); + GE_IF_BOOL_EXEC( + true, GeTensorDesc tensor_desc = op_desc->GetInputDesc(i); int64_t tensor_size = 0; + GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); + if (tensor_size) { v_input_size.push_back(tensor_size); }); continue; } - uint32_t tensor_size = 0; + int64_t tensor_size = 0; GE_IF_BOOL_EXEC( - TensorUtils::GetSize(op_desc->GetInputDesc(i), tensor_size) != GRAPH_SUCCESS, - GELOGI("Get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); - continue;); + TensorUtils::GetSize(op_desc->GetInputDesc(i), tensor_size) != GRAPH_SUCCESS, + GELOGI("Get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); + continue;); v_input_size.push_back(tensor_size); } @@ -107,8 +111,8 @@ vector ModelUtils::GetInputSize(ConstOpDescPtr op_desc) { /// @brief Get output size. /// @return vector /// -vector ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) { - vector v_output_size; +vector ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) { + vector v_output_size; GE_CHECK_NOTNULL_EXEC(op_desc, return v_output_size); const size_t outputs_size = op_desc->GetOutputsSize(); @@ -118,11 +122,11 @@ vector ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) { return v_output_size;); for (size_t i = 0; i < outputs_size; ++i) { - uint32_t tensor_size = 0; + int64_t tensor_size = 0; GE_IF_BOOL_EXEC( - TensorUtils::GetSize(op_desc->GetOutputDesc(i), tensor_size) != GRAPH_SUCCESS, - GELOGI("Get size from TensorDesc failed, op : %s, output index : %zu", op_desc->GetName().c_str(), i); - continue;); + TensorUtils::GetSize(op_desc->GetOutputDesc(i), tensor_size) != GRAPH_SUCCESS, + GELOGI("Get size from TensorDesc failed, op : %s, output index : %zu", op_desc->GetName().c_str(), i); + continue;); v_output_size.push_back(tensor_size); } @@ -135,8 +139,8 @@ vector ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) { /// @brief Get workspace size. /// @return vector /// -vector ModelUtils::GetWorkspaceSize(ConstOpDescPtr op_desc) { - vector v_workspace_size; +vector ModelUtils::GetWorkspaceSize(ConstOpDescPtr op_desc) { + vector v_workspace_size; GE_CHECK_NOTNULL_EXEC(op_desc, return v_workspace_size); const vector v_workspace_num = op_desc->GetWorkspace(); @@ -158,8 +162,8 @@ vector ModelUtils::GetWorkspaceSize(ConstOpDescPtr op_desc) { /// @brief Get weight size. 
/// @return vector /// -vector ModelUtils::GetWeightSize(ConstOpDescPtr op_desc) { - vector v_weight_size; +vector ModelUtils::GetWeightSize(ConstOpDescPtr op_desc) { + vector v_weight_size; GE_CHECK_NOTNULL_EXEC(op_desc, return v_weight_size); // const op, get weight directly @@ -178,7 +182,9 @@ vector ModelUtils::GetWeightSize(ConstOpDescPtr op_desc) { const vector v_is_input_const = op_desc->GetIsInputConst(); for (size_t i = 0; i < inputs_size; ++i) { if ((i < v_is_input_const.size()) && v_is_input_const[i]) { - v_weight_size.push_back(TensorUtils::GetWeightSize(op_desc->GetInputDesc(i))); + int64_t tensor_size = 0; + (void)TensorUtils::GetSize(op_desc->GetInputDesc(i), tensor_size); + v_weight_size.push_back(tensor_size); } } @@ -226,8 +232,8 @@ vector ModelUtils::GetWeights(ConstOpDescPtr op_desc) { /// @brief Save Output tensor info to vector. /// @return Status /// -Status ModelUtils::GetOutputSize(ConstOpDescPtr op_desc, vector &output_size_list, - vector &output_memory_size_list) { +Status ModelUtils::GetOutputSize(ConstOpDescPtr op_desc, vector &output_size_list, + vector &output_memory_size_list) { GE_CHECK_NOTNULL(op_desc); for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) { @@ -238,8 +244,8 @@ Status ModelUtils::GetOutputSize(ConstOpDescPtr op_desc, vector &outpu if (output_tensor) { // get transferred parameters such as size - uint32_t size = 0; - uint32_t memory_size = 0; + int64_t size = 0; + int64_t memory_size = 0; graphStatus graph_status0 = TensorUtils::GetTensorSizeInBytes(output_desc, size); graphStatus graph_status1 = TensorUtils::GetTensorMemorySizeInBytes(output_desc, memory_size); if ((graph_status0 != GRAPH_SUCCESS) || (graph_status1 != GRAPH_SUCCESS)) { @@ -319,7 +325,7 @@ vector<::tagCcAICPUTensor> ModelUtils::GetOutputDescs(ConstOpDescPtr op_desc) { tmp.data_type = tagOpDataType(tmp_type); for (int32_t j = 0; j < 4; j++) { // 4 dims - tmp.dim[j] = static_cast(j < tmp.dim_cnt ? descriptor.GetShape().GetDim(j) : 1); + tmp.dim[j] = (j < tmp.dim_cnt ? 
descriptor.GetShape().GetDim(j) : 1); } v_output_descs.push_back(tmp); @@ -375,16 +381,24 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co size_t non_const_index = 0; const vector v_is_input_const = op_desc->GetIsInputConst(); + vector v_memory_type; + bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, v_memory_type); + if (has_mem_type_attr && (v_memory_type.size() != inputs_size)) { + GELOGE(PARAM_INVALID, "L1Fusion: check input size failed, op: %s, input v_memory_type size: %zu input numbers: %zu", + op_desc->GetName().c_str(), v_memory_type.size(), inputs_size); + return v_input_data_addr; + } for (size_t i = 0; i < inputs_size; ++i) { if ((i < v_is_input_const.size()) && v_is_input_const[i] && (op_type != NETOUTPUT)) { // TBE: add weights address to input - GE_IF_BOOL_EXEC(true, GeTensorDesc tensor_desc = op_desc->GetInputDesc(i); uint32_t tensor_size = 0; - GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); if (tensor_size) { - int64_t data_offset = 0; - GE_CHK_STATUS(TensorUtils::GetDataOffset(tensor_desc, data_offset)); - uint8_t *weight_addr = static_cast(weight_base + data_offset - logic_weight_base); - v_input_data_addr.push_back(weight_addr); - }); + GE_IF_BOOL_EXEC( + true, GeTensorDesc tensor_desc = op_desc->GetInputDesc(i); int64_t tensor_size = 0; + GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size)); if (tensor_size) { + int64_t data_offset = 0; + GE_CHK_STATUS(TensorUtils::GetDataOffset(tensor_desc, data_offset)); + uint8_t *weight_addr = static_cast(weight_base + data_offset - logic_weight_base); + v_input_data_addr.push_back(weight_addr); + }); non_const_index++; continue; } @@ -396,17 +410,23 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co int64_t input_offset = v_input_offset[non_const_index]; non_const_index++; GE_IF_BOOL_EXEC(var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(input_offset), - uint8_t *variable_addr = var_base + input_offset - logic_var_base; - v_input_data_addr.push_back(variable_addr); - continue;); + uint8_t *variable_addr = var_base + input_offset - logic_var_base; + v_input_data_addr.push_back(variable_addr); continue;); bool input_tensor = false; GE_IF_BOOL_EXEC(TensorUtils::GetInputTensor(op_desc->GetOutputDesc(i), input_tensor) != GRAPH_SUCCESS, GELOGW("get size from TensorDesc failed, op: %s, input index: %zu", op_desc->GetName().c_str(), i); continue;); - - uint8_t *mem_addr = mem_base + input_offset - logic_mem_base; - v_input_data_addr.push_back(mem_addr); + // feature maps + uint8_t *mem_addr = nullptr; + // l1 fusion + if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { + mem_addr = reinterpret_cast(input_offset); + v_input_data_addr.push_back(mem_addr); + } else { + mem_addr = static_cast(mem_base + input_offset - logic_mem_base); + v_input_data_addr.push_back(mem_addr); + } } return v_input_data_addr; @@ -448,16 +468,32 @@ vector ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C GE_IF_BOOL_EXEC(v_output_offset.size() != outputs_size, GELOGW("Output param invalid: output_offset=%zu, outputs=%zu.", v_output_offset.size(), outputs_size); return v_output_data_addr;); - + vector v_memory_type; + bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, v_memory_type); + if (has_mem_type_attr && (v_memory_type.size() != outputs_size)) { + GELOGE(PARAM_INVALID, + "L1Fusion: check output size failed, op: %s, output v_memory_type size: %lu output numbers: 
%zu", + op_desc->GetName().c_str(), v_memory_type.size(), outputs_size); + return v_output_data_addr; + } for (size_t i = 0; i < outputs_size; ++i) { GE_IF_BOOL_EXEC(var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(v_output_offset[i]), - uint8_t *variable_addr = static_cast(var_base + v_output_offset[i] - logic_var_base); - v_output_data_addr.push_back(variable_addr); - continue;); - uint8_t *mem_addr = mem_base + v_output_offset[i] - logic_mem_base; - v_output_data_addr.push_back(mem_addr); + uint8_t *variable_addr = static_cast(var_base + v_output_offset[i] - logic_var_base); + v_output_data_addr.push_back(variable_addr); + GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[V] name[%s] output[%zu] memaddr[%p]", + model_param.graph_id, op_desc->GetName().c_str(), i, variable_addr); + continue;); + // feature maps + uint8_t *mem_addr = nullptr; + // l1 fusion + if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { + mem_addr = reinterpret_cast(v_output_offset[i]); + v_output_data_addr.push_back(mem_addr); + } else { + mem_addr = static_cast(mem_base + v_output_offset[i] - logic_mem_base); + v_output_data_addr.push_back(mem_addr); + } } - return v_output_data_addr; } @@ -466,30 +502,43 @@ vector ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C /// @brief Get workspace data address. /// @return vector /// -vector ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc) { +vector ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc, + bool need_convert) { vector v_workspace_data_addr; GE_CHECK_NOTNULL_EXEC(op_desc, return v_workspace_data_addr); uint8_t *mem_base = model_param.mem_base; uint64_t mem_size = model_param.mem_size; - Status status = ConvertVirtualAddressToPhysical(mem_base, mem_size, mem_base); - if (status != SUCCESS) { - GELOGE(RT_FAILED, "Convert virtual address to physical for mem_base failed."); - return v_workspace_data_addr; + if (need_convert) { + Status status = ConvertVirtualAddressToPhysical(mem_base, mem_size, mem_base); + if (status != SUCCESS) { + GELOGE(RT_FAILED, "Convert virtual address to physical for mem_base failed."); + return v_workspace_data_addr; + } } - const vector v_workspace_num = op_desc->GetWorkspace(); + const vector v_workspace_offset = op_desc->GetWorkspace(); const vector v_workspace_bytes = op_desc->GetWorkspaceBytes(); - if (v_workspace_num.size() != v_workspace_bytes.size()) { - GELOGW("v_workspace_num.size()[%zu] != v_workspace_bytes.size()[%zu]", v_workspace_num.size(), + if (v_workspace_offset.size() != v_workspace_bytes.size()) { + GELOGW("v_workspace_offset.size()[%zu] != v_workspace_bytes.size()[%zu]", v_workspace_offset.size(), v_workspace_bytes.size()); return v_workspace_data_addr; } - + vector v_memory_type; + bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, v_memory_type); for (size_t i = 0; i < v_workspace_bytes.size(); ++i) { - int64_t workspace_num = v_workspace_num[i]; - int64_t workspace_bytes = v_workspace_bytes[i]; - v_workspace_data_addr.push_back(workspace_bytes == 0 ? 
nullptr : mem_base + workspace_num); + if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { + v_workspace_data_addr.push_back(reinterpret_cast(v_workspace_offset[i])); + GELOGI("L1Fusion: op: %s, GetWorkspaceDataAddrs mem_addr[workspace index %zu]:%p", op_desc->GetName().c_str(), i, + reinterpret_cast(v_workspace_offset[i])); + } else { + int64_t workspace_offset = v_workspace_offset[i]; + int64_t workspace_bytes = v_workspace_bytes[i]; + uint8_t *mem_addr = workspace_bytes == 0 ? nullptr : mem_base + workspace_offset; + v_workspace_data_addr.push_back(mem_addr); + GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] output[%zu] offset[%ld] bytes[%ld] memaddr[%p]", + model_param.graph_id, op_desc->GetName().c_str(), i, workspace_offset, workspace_bytes, mem_addr); + } } return v_workspace_data_addr; diff --git a/src/ge/graph/load/new_model_manager/model_utils.h b/src/ge/graph/load/new_model_manager/model_utils.h index 950fdbe8..1a15c930 100644 --- a/src/ge/graph/load/new_model_manager/model_utils.h +++ b/src/ge/graph/load/new_model_manager/model_utils.h @@ -52,28 +52,28 @@ class ModelUtils { /// @brief Get input size. /// @return vector /// - static vector GetInputSize(ConstOpDescPtr op_desc); + static vector GetInputSize(ConstOpDescPtr op_desc); /// /// @ingroup domi_ome /// @brief Get output size. /// @return vector /// - static vector GetOutputSize(ConstOpDescPtr op_desc); + static vector GetOutputSize(ConstOpDescPtr op_desc); /// /// @ingroup domi_ome /// @brief Get workspace size. /// @return vector /// - static vector GetWorkspaceSize(ConstOpDescPtr op_desc); + static vector GetWorkspaceSize(ConstOpDescPtr op_desc); /// /// @ingroup domi_ome /// @brief Get weight size. /// @return vector /// - static vector GetWeightSize(ConstOpDescPtr op_desc); + static vector GetWeightSize(ConstOpDescPtr op_desc); /// /// @ingroup domi_ome @@ -87,8 +87,8 @@ class ModelUtils { /// @brief Save Output tensor info to vector. /// @return Status /// - static Status GetOutputSize(ConstOpDescPtr op_desc, vector &output_size_list, - vector &output_memory_size_list); + static Status GetOutputSize(ConstOpDescPtr op_desc, vector &output_size_list, + vector &output_memory_size_list); /// /// @ingroup domi_ome @@ -123,7 +123,8 @@ class ModelUtils { /// @brief Get workspace data address. 
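/// @note The need_convert flag added below defaults to true, preserving the old
/// behavior; pass false when the returned pointers are used as virtual
/// addresses on the host side, as KernelExTaskInfo::CopyTaskInfo does later in
/// this patch:
/// @code
///   auto addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc, false);
/// @endcode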
/// @return vector /// - static vector GetWorkspaceDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc); + static vector GetWorkspaceDataAddrs(const RuntimeParam &model_param, ConstOpDescPtr op_desc, + bool need_convert = true); static ge::Status ConvertVirtualAddressToPhysical(uint8_t *virtual_address, uint64_t size, uint8_t *&physical_address); diff --git a/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc index f14f593e..cb30092c 100644 --- a/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc @@ -16,17 +16,21 @@ #include "graph/load/new_model_manager/task_info/end_graph_task_info.h" +#include "common/properties_manager.h" #include "framework/common/debug/ge_log.h" #include "graph/load/new_model_manager/davinci_model.h" +namespace { +const uint32_t kDumpFlag = 2; +} // namespace namespace ge { Status EndGraphTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { - GELOGI("InitEndGraphTaskInfo start."); + GELOGI("InitEndGraphTaskInfo Init Start."); if (davinci_model == nullptr) { GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } - + davinci_model_ = davinci_model; Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); if (ret != SUCCESS) { GELOGE(ret, "SetStream fail, stream_id:%u", task_def.stream_id()); @@ -34,21 +38,42 @@ Status EndGraphTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin } model_ = davinci_model->GetRtModelHandle(); - return SUCCESS; } Status EndGraphTaskInfo::Distribute() { GELOGI("EndGraphTaskInfo Distribute Start."); + auto all_dump_model = PropertiesManager::Instance().GetAllDumpModel(); + if (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end() || + all_dump_model.find(davinci_model_->Name()) != all_dump_model.end()) { + GELOGI("Start to call rtEndGraphEx"); + rtError_t rt_ret = rtEndGraphEx(model_, stream_, kDumpFlag); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rtEndGraphEx failed, ret: 0x%x", rt_ret); + return RT_FAILED; + } + } else { + GELOGI("Start to call rtEndGraph"); + rtError_t rt_ret = rtEndGraph(model_, stream_); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rtEndGraph failed, ret: 0x%x", rt_ret); + return RT_FAILED; + } + } - rtError_t rt_ret = rtEndGraph(model_, stream_); + uint32_t task_id = 0; + GE_CHECK_NOTNULL(davinci_model_); + rtError_t rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &task_id); if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rtEndGraph failed, ret: 0x%x", rt_ret); + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } + task_id_ = task_id; + GELOGI("EndGraphTaskInfo Distribute Success, task id is %u", task_id); return SUCCESS; } REGISTER_TASK_INFO(RT_MODEL_TASK_MODEL_END_GRAPH, EndGraphTaskInfo); + } // namespace ge diff --git a/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.h b/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.h index 17f3b002..1c039172 100644 --- a/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.h @@ -29,8 +29,13 @@ class EndGraphTaskInfo : public TaskInfo { Status Distribute() override; + uint32_t GetTaskID() override { return task_id_; } + private: rtModel_t model_; + DavinciModel *davinci_model_; + uint32_t task_id_; }; 
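+ // task_id_ is filled in Distribute() via rtModelGetTaskId() and exposed through
+ // the GetTaskID() override, so profiling and dump logic can attribute the
+ // end-graph task.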
+ } // namespace ge #endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_END_GRAPH_TASK_INFO_H_ diff --git a/src/ge/graph/load/new_model_manager/task_info/fusion_start_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/fusion_start_task_info.cc index 3463b41b..f3fa7959 100644 --- a/src/ge/graph/load/new_model_manager/task_info/fusion_start_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/fusion_start_task_info.cc @@ -43,6 +43,7 @@ Status FusionStartTaskInfo::Distribute() { return RT_FAILED; } + GELOGI("FusionStartTaskInfo Distribute Success."); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/fusion_stop_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/fusion_stop_task_info.cc index 27d7f345..128fb325 100644 --- a/src/ge/graph/load/new_model_manager/task_info/fusion_stop_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/fusion_stop_task_info.cc @@ -43,6 +43,7 @@ Status FusionStopTaskInfo::Distribute() { return RT_FAILED; } + GELOGI("FusionStopTaskInfo Distribute Success."); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc index 3facd504..52511f03 100644 --- a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc @@ -92,13 +92,13 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m auto workspace_bytes = op_desc->GetWorkspaceBytes(); if (!workspace_bytes.empty()) { uint64_t workspace_mem_size_tmp = workspace_bytes[0]; - GELOGI("hccl need work_space_mem_size=%lu", workspace_mem_size_tmp); + GELOGI("hccl need workSpaceMemSize=%lu", workspace_mem_size_tmp); if (workspace_mem_size_tmp != 0) { workspace_mem_size_ = workspace_mem_size_tmp; vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(davinci_model->GetRuntimeParam(), op_desc); if (!workspace_data_addrs.empty()) { - GELOGI("Get work_space_addr"); + GELOGI("Get workSpaceAddr"); workspace_addr_ = workspace_data_addrs[0]; } } @@ -106,11 +106,9 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m // GE's new process: hccl declares the number of streams required, creates a stream by GE, and sends it to hccl int64_t hccl_stream_num = 0; if (!ge::AttrUtils::GetInt(op_desc, "used_stream_num", hccl_stream_num)) { - GELOGW("op_desc has no attr used_stream_num!"); + GELOGI("op_desc has no attr used_stream_num!"); } - GELOGI("hcclStreamNum =%ld", hccl_stream_num); - for (int64_t i = 0; i < hccl_stream_num; ++i) { rtStream_t stream = nullptr; rtError_t rt_ret = @@ -131,6 +129,7 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m davinci_model->PushHcclStream(stream); } + GELOGI("HcclTaskInfo Init Success, hcclStreamNum =%ld", hccl_stream_num); return SUCCESS; } @@ -151,7 +150,7 @@ Status HcclTaskInfo::Distribute() { return INTERNAL_ERROR; } - GELOGI("Call function LoadTask end."); + GELOGI("HcclTaskInfo Distribute Success."); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc index cfee3610..88e8a1bb 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc @@ -28,7 +28,6 @@ #include "graph/load/new_model_manager/model_manager.h" namespace ge { - Status 
KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { GELOGI("KernelExTaskInfo Init Start."); if (davinci_model == nullptr) { @@ -51,15 +50,15 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin return INTERNAL_ERROR; } + if (CopyTaskInfo(kernel_ex_def, davinci_model->GetRuntimeParam(), op_desc) != SUCCESS) { + GELOGE(FAILED, "copy task info to workspace failed."); + return FAILED; + } + vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(davinci_model->GetRuntimeParam(), op_desc); if (workspace_data_addrs.empty()) { GELOGE(FAILED, "workspace_data_addrs is empty."); return FAILED; - } else { - rtError_t rt_ret = - rtMemcpy(workspace_data_addrs[0], kernel_ex_def.task_info_size(), kernel_ex_def.task_info().data(), - kernel_ex_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(FAILED, "rtMemcpy error: 0x%X", rt_ret); return FAILED); } // 2. Reconstruct kernelExDef.args to STR_FWK_OP_KERNEL @@ -117,10 +116,9 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin // 4. Create session auto session_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID; - GELOGI("session_id: %lu", session_id); GE_CHECK_NOTNULL(ModelManager::GetInstance()); GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS, - GELOGE(FAILED, "CreateAicpuSession error."); + GELOGE(FAILED, "CreateAicpuSession error. session id: %lu", session_id); return FAILED;) // 4.1 Collect aicpu kernel uint64_t kernel_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.kernelID; @@ -134,10 +132,44 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin rt_ret = rtMemcpy(kernel_buf_, sizeof(STR_FWK_OP_KERNEL), static_cast(&fwk_op_kernel), sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy error, ret: 0x%X", rt_ret); return FAILED;) - davinci_model->SetZeroCopyAddr(io_addrs, input_output_addr_); + davinci_model->SetZeroCopyAddr(op_desc, io_addrs, input_output_addr_); kernel_buf_size_ = sizeof(STR_FWK_OP_KERNEL); davinci_model_ = davinci_model; + + GELOGI("KernelExTaskInfo Init Success. session id: %lu", session_id); + return SUCCESS; +} + +Status KernelExTaskInfo::CopyTaskInfo(const domi::KernelExDef &kernel_def, const RuntimeParam &rts_param, + const OpDescPtr &op_desc) { + // Userspace copy needs the virtual address. 
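+ // Steps below: fetch workspace[0] without the virtual-to-physical conversion
+ // (need_convert = false, since the host-side rtMemcpy needs the virtual
+ // address), validate that it exists and can hold the serialized task info,
+ // then copy it host-to-device.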
+ const vector workspace_data_sizes = ModelUtils::GetWorkspaceSize(op_desc); + const vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc, false); + if (workspace_data_addrs.empty() || workspace_data_sizes.empty()) { + GELOGE(FAILED, "Node:%s invalid workspace, addrs is %zu, size is %zu.", op_desc->GetName().c_str(), + workspace_data_addrs.size(), workspace_data_sizes.size()); + return FAILED; + } + + if (workspace_data_addrs[0] == nullptr) { + GELOGE(FAILED, "Node:%s workspace addrs is null.", op_desc->GetName().c_str()); + return FAILED; + } + + if (workspace_data_sizes[0] < static_cast(kernel_def.task_info_size())) { + GELOGE(FAILED, "Node:%s workspace size is %zu, task info size is %zu.", op_desc->GetName().c_str(), + workspace_data_sizes[0], kernel_def.task_info_size()); + return FAILED; + } + + rtError_t rt_ret = rtMemcpy(workspace_data_addrs[0], kernel_def.task_info_size(), kernel_def.task_info().data(), + kernel_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(FAILED, "rtMemcpy error: 0x%X", rt_ret); + return FAILED; + } + return SUCCESS; } @@ -154,14 +186,17 @@ Status KernelExTaskInfo::Distribute() { return PARAM_INVALID; } - uint32_t taskid = 0; - rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &taskid); + uint32_t task_id = 0; + uint32_t stream_id = UINT32_MAX; // default value, wait for rts + rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &task_id); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } - task_id_ = taskid; + task_id_ = task_id; + stream_id_ = stream_id; + GELOGI("KernelExTaskInfo Distribute Success. task id: %u", task_id_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h index a1fd541f..9aab55e7 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h @@ -18,6 +18,7 @@ #define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_KERNEL_EX_TASK_INFO_H_ #include "graph/load/new_model_manager/task_info/task_info.h" +#include "graph/op_desc.h" namespace ge { class KernelExTaskInfo : public TaskInfo { @@ -41,13 +42,18 @@ class KernelExTaskInfo : public TaskInfo { uint32_t GetTaskID() override { return task_id_; } + uint32_t GetStreamId() override { return stream_id_; } + uintptr_t GetDumpArgs() override { - auto ret = reinterpret_cast(dump_args_); + auto ret = reinterpret_cast(dump_args_); return ret; } private: + Status CopyTaskInfo(const domi::KernelExDef &kernel_def, const RuntimeParam &rts_param, const OpDescPtr &op_desc); + uint32_t task_id_; + uint32_t stream_id_; uint32_t dump_flag_; uint32_t kernel_buf_size_; DavinciModel *davinci_model_; diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc index 5b3877c8..407efd63 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc @@ -26,21 +26,36 @@ #include "framework/common/l2_cache_optimize.h" #include "graph/load/new_model_manager/davinci_model.h" #include "graph/load/new_model_manager/model_utils.h" +#include "graph/debug/ge_attr_define.h" #include "runtime/kernel.h" +#include "graph/debug/ge_attr_define.h" +#include "super_kernel/super_kernel_factory.h" +#include 
"super_kernel/super_kernel.h" namespace { const uint8_t kL2LoadToDdr = 1; const uint8_t kL2NotLoadToDdr = 0; +// for skt +constexpr int64_t kInvalidGroupKey = -1; +constexpr uint32_t kSKTSingleSize = 1; +constexpr uint32_t kSKTMaxSizeLimit = 20000; +const char *kIsLastNode = "is_last_node"; +const char *kIsFirstNode = "is_first_node"; +const int64_t kCloseSkt = 100; } // namespace namespace ge { +KernelTaskInfo::SuperKernelTaskInfo KernelTaskInfo::skt_info_ = { + 0, 0, 0, nullptr, nullptr, {}, {}, RT_KERNEL_DEFAULT, kInvalidGroupKey, 0, nullptr}; + Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { - GELOGD("KernelTaskInfo Init Start."); if (davinci_model == nullptr) { GELOGE(PARAM_INVALID, "davinci_model is null!"); return PARAM_INVALID; } davinci_model_ = davinci_model; + is_l1_fusion_enable_ = davinci_model_->GetL1FusionEnableOption(); + GELOGD("KernelTaskInfo Init Start, ge.enableL1Fusion in davinci model is %d.", is_l1_fusion_enable_); Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); if (ret != SUCCESS) { @@ -54,15 +69,29 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci const domi::KernelContext &context = kernel_def.context(); // get kernel_type kernel_type_ = static_cast(context.kernel_type()); - // get bin_file_key - OpDescPtr op_desc = davinci_model->GetOpByIndex(context.op_index()); - if (op_desc == nullptr) { + // get opdesc + op_desc_ = davinci_model->GetOpByIndex(context.op_index()); + if (op_desc_ == nullptr) { GELOGE(INTERNAL_ERROR, "Get op_desc failed, index is out of range!"); return INTERNAL_ERROR; } + (void)AttrUtils::GetBool(*op_desc_, ATTR_N_BATCH_SPILT, is_n_batch_spilt_); + GELOGD("node[%s] is_n_batch_spilt %d", op_desc_->GetName().c_str(), is_n_batch_spilt_); + (void)AttrUtils::GetInt(*op_desc_, ATTR_NAME_L1_FUSION_GROUP_KEY, group_key_); + has_group_key_ = (group_key_ != kInvalidGroupKey); + GELOGD("node[%s] has_group_key_ %ld, group key is [%ld]", op_desc_->GetName().c_str(), has_group_key_, group_key_); + + // fusion_op_info + vector original_op_names; + bool result = AttrUtils::GetListStr(op_desc_, ge::ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, original_op_names); + GE_IF_BOOL_EXEC(result, fusion_op_info_.stream_id = task_def.stream_id(); + fusion_op_info_.op_index = context.op_index(); fusion_op_info_.original_op_names = original_op_names; + fusion_op_info_.op_name = op_desc_->GetName()); + string session_graph_model_id; - davinci_model->GetUniqueId(op_desc, session_graph_model_id); - const char *bin_file_key = DavinciModel::GetRegisterStub(op_desc->GetName(), session_graph_model_id); + davinci_model->GetUniqueId(op_desc_, session_graph_model_id); + // get bin_file_key + const char *bin_file_key = DavinciModel::GetRegisterStub(op_desc_->GetName(), session_graph_model_id); // new aicpu kernel(rtCpuKernelLaunch) no need to check function if (kernel_type_ == cce::ccKernelType::CCE_AI_CORE) { rtError_t rt_ret; @@ -107,39 +136,242 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci } ret = InitCceTask(davinci_model, kernel_def); } - GELOGD("KernelTaskInfo Init end."); + GELOGD("KernelTaskInfo Init finish, result=%u.", ret); return ret; } -Status KernelTaskInfo::Distribute() { - GELOGD("KernelTaskInfo Distribute Start."); +Status KernelTaskInfo::SaveSKTDumpInfo() { + GE_CHECK_NOTNULL(davinci_model_); + davinci_model_->SaveDumpTask(skt_info_.last_task_id, skt_info_.last_op, skt_info_.last_dump_args); + return SUCCESS; +} + +void 
KernelTaskInfo::UpdateSKTTaskId() { + uint32_t task_id = 0; + if (davinci_model_ != nullptr) { + rtError_t rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &task_id); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return; + } + skt_info_.last_task_id = task_id; + skt_id_ = skt_info_.last_task_id; + GELOGI("UpdateTaskId:UpdateSKTTaskId [%u]", task_id); + } +} + +void KernelTaskInfo::UpdateTaskId() { + uint32_t task_id = 0; + uint32_t stream_id = UINT32_MAX; // default value, wait for rts + if (davinci_model_ != nullptr) { + rtError_t rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &task_id); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return; + } + task_id_ = task_id; + stream_id_ = stream_id; + GELOGI("UpdateTaskId:UpdateTaskId [%u]", task_id); + } +} + +Status KernelTaskInfo::SKTFinalize() { + UpdateSKTTaskId(); + GE_CHK_STATUS_RET(SaveSKTDumpInfo(), "skt save dump info failed"); + GELOGI("SuperKernel Distribute [skt_id:%u]", skt_id_); + skt_info_.kernel_list.clear(); + skt_info_.arg_list.clear(); + skt_info_.last_stream = nullptr; + skt_info_.last_block_dim = 0; + skt_info_.last_sm_desc = sm_desc_; + skt_info_.last_group_key = kInvalidGroupKey; + skt_info_.last_dump_flag = RT_KERNEL_DEFAULT; + skt_info_.last_dump_args = 0; + skt_info_.last_op = nullptr; + return SUCCESS; +} + +Status KernelTaskInfo::SuperKernelLaunch() { + if (skt_info_.kernel_list.empty()) { + GELOGI("SuperKernelLaunch: Skt_kernel_list has no task, just return"); + return SUCCESS; + } rtError_t rt_ret; + auto &skt_kernel_list = skt_info_.kernel_list; + auto &skt_arg_list = skt_info_.arg_list; + GELOGI("SuperKernelLaunch: Skt_kernel_list size[%d] skt_arg_list[%d]", skt_kernel_list.size(), skt_arg_list.size()); + if (skt_kernel_list.size() == kSKTSingleSize) { + rt_ret = rtKernelLaunchWithFlag(skt_info_.kernel_list[0], static_cast(skt_info_.last_block_dim), + skt_info_.arg_list[0], skt_info_.last_args_size, + static_cast(skt_info_.last_sm_desc), skt_info_.last_stream, + skt_info_.last_dump_flag); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "SuperKernelLaunch: Call rt api failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + GE_CHK_STATUS_RET(SKTFinalize(), "Skt finalize failed"); + return SUCCESS; + } + // Create super kernel factory + skt::SuperKernelFactory *factory = &skt::SuperKernelFactory::GetInstance(); + // Init super kernel factory + if (factory->Init() != SUCCESS) { + GELOGE(RT_FAILED, "SuperKernelLaunch: SuperKernelFactory init failed"); + return RT_FAILED; + } + // Call the fuse API + skt::SuperKernel *superKernel; + if (factory->FuseKernels(skt_kernel_list, skt_arg_list, skt_info_.last_block_dim, superKernel) != SUCCESS) { + GELOGE(RT_FAILED, "SuperKernelLaunch: fuse call failed"); + return RT_FAILED; + } + // Launch a super kernel + if (superKernel->Launch(skt_info_.last_stream, true) != SUCCESS) { + GELOGE(RT_FAILED, "SuperKernelLaunch: launch failed"); + return RT_FAILED; + } + GELOGI("SuperKernelLaunch: success[skt_kernel_list size[%zu] skt_arg_list[%zu]]", skt_kernel_list.size(), + skt_arg_list.size()); + GE_CHK_STATUS_RET(SKTFinalize(), "Skt finalize failed"); + return SUCCESS; +} + +Status KernelTaskInfo::SaveSuperKernelInfo() { + skt_info_.kernel_list.push_back(stub_func_); + skt_info_.arg_list.push_back(args_); + skt_info_.last_stream = stream_; + skt_info_.last_block_dim = block_dim_; + skt_info_.last_args_size = args_size_; + skt_info_.last_sm_desc = sm_desc_; 
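+ // The remaining fields capture the launch context so that a later
+ // SuperKernelLaunch() can fuse and replay this task: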
+ skt_info_.last_dump_flag = dump_flag_; + skt_info_.last_group_key = group_key_; + skt_info_.last_dump_args = reinterpret_cast(dump_args_); + skt_info_.last_op = op_desc_; + // last node in a stream, just launch + if (IsMarkedLastNode()) { + return SuperKernelLaunch(); + } + return SUCCESS; +} + +bool KernelTaskInfo::IsMarkedLastNode() { + if (davinci_model_ == nullptr) { + GELOGE(PARAM_INVALID, "davinci_model is null!"); + return false; + } + OpDescPtr op_desc = davinci_model_->GetOpByIndex(ctx_.opIndex); + if (op_desc == nullptr) { + GELOGE(INTERNAL_ERROR, "InitTVMTaskInfo error, index is out of range!"); + return false; + } + bool is_last_node = false; + (void)AttrUtils::GetBool(*op_desc, kIsLastNode, is_last_node); + return is_last_node; +} +bool KernelTaskInfo::IsMarkedFirstNode() { + if (davinci_model_ == nullptr) { + GELOGE(PARAM_INVALID, "davinci_model is null!"); + return false; + } + OpDescPtr op_desc = davinci_model_->GetOpByIndex(ctx_.opIndex); + if (op_desc == nullptr) { + GELOGE(INTERNAL_ERROR, "InitTVMTaskInfo error, index is out of range!"); + return false; + } + bool is_first_node = false; + (void)AttrUtils::GetBool(*op_desc, kIsFirstNode, is_first_node); + return is_first_node; +} +// The current task may be appended to the skt task list only when its block dim, +// stream and group key (if any) match the previous task; otherwise the tasks +// saved so far must be launched first via SuperKernelLaunch(). +bool KernelTaskInfo::FirstCallSKTLaunchCheck() { + return ((block_dim_ != skt_info_.last_block_dim) || (stream_ != skt_info_.last_stream) || + (has_group_key_ && (group_key_ != skt_info_.last_group_key))); +} + +// A task is saved to the skt task list only if it carries a group key or the +// ATTR_N_BATCH_SPILT attribute; otherwise the saved tasks are launched and the +// current task falls back to a plain rtKernelLaunchWithFlag. +bool KernelTaskInfo::DoubleCallSKTSaveCheck() { return (!is_n_batch_spilt_ && !has_group_key_); } + +Status KernelTaskInfo::SuperKernelDistribute() { + Status ret; + char *skt_task_num = getenv("SKT_TASK_NUM"); + auto task_num = static_cast((skt_task_num != nullptr) ? strtol(skt_task_num, nullptr, 10) + : kSKTMaxSizeLimit); // 10 for decimal number + GELOGI("SKT: SuperKernel Distribute task num limit[%u]", task_num); + if (FirstCallSKTLaunchCheck()) { + ret = SuperKernelLaunch(); + if (ret != SUCCESS) { + GELOGE(FAILED, "Call SuperKernelLaunch failed!"); + return FAILED; + } + } + if (DoubleCallSKTSaveCheck()) { + // 1. launch the tasks saved so far + ret = SuperKernelLaunch(); + if (ret != SUCCESS) { + GELOGE(FAILED, "Call SuperKernelLaunch failed!"); + return FAILED; + } + // 2. launch the current task + rtError_t rt_ret = rtKernelLaunchWithFlag(stub_func_, block_dim_, args_, args_size_, + static_cast(sm_desc_), stream_, dump_flag_); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return FAILED; + } + UpdateTaskId(); + GELOGI("Current Common Task Distribute [taskid:%u]", task_id_); + } else { + ret = SaveSuperKernelInfo(); + if (ret != SUCCESS) { + GELOGE(FAILED, "Call SaveSuperKernelInfo failed!"); + return FAILED; + } + GELOGI("Save Current task [block_dim:%u, size:%zu].", block_dim_, skt_info_.kernel_list.size()); + } + return SUCCESS; +} + +Status KernelTaskInfo::Distribute() { + GELOGD("KernelTaskInfo Distribute Start."); + rtError_t rt_ret = RT_ERROR_NONE; + char *skt_enable_env = getenv("SKT_ENABLE"); + int64_t env_flag = (skt_enable_env != nullptr) ? 
+                         strtol(skt_enable_env, nullptr, 10) : 0;
   if (kernel_type_ == cce::ccKernelType::AI_CPU) {
     // blockDim is reserved parameter, set to 1
     rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(so_name_.c_str()),
                                        reinterpret_cast<const void *>(kernel_name_.c_str()), 1, args_, args_size_,
                                        nullptr, stream_, dump_flag_);
   } else {
-    rt_ret = rtKernelLaunchWithFlag(stub_func_, block_dim_, args_, args_size_, static_cast<rtSmDesc_t *>(sm_desc_),
-                                    stream_, dump_flag_);
-  }
-
-  if (rt_ret != RT_ERROR_NONE) {
-    GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
-    return RT_FAILED;
+    /* default: not skt launch */
+    bool call_skt = ((env_flag != 0) || is_l1_fusion_enable_);
+    GELOGI(
+        "KernelTaskInfo Distribute Start, sktenable:%ld taskid:%u sktid:%u last_sktid:%u stubfunc_name:%s "
+        "stubfunc:%p blockdim:%u stream:%p",
+        env_flag, task_id_, skt_id_, skt_info_.last_task_id, stub_func_name_.c_str(), stub_func_, block_dim_, stream_);
+    // l1 fusion enable and env flag open (kCloseSkt for skt debug)
+    if (call_skt && (env_flag != kCloseSkt)) {
+      GE_RETURN_IF_ERROR(SuperKernelDistribute());
+    } else {
+      // call rtKernelLaunch for current task
+      rt_ret = rtKernelLaunchWithFlag(stub_func_, block_dim_, args_, args_size_, static_cast<rtSmDesc_t *>(sm_desc_),
+                                      stream_, dump_flag_);
+    }
   }
-
-  uint32_t taskid = 0;
-  GE_CHECK_NOTNULL(davinci_model_);
-  rt_ret = rtModelGetTaskId(davinci_model_->GetRtModelHandle(), &taskid);
   if (rt_ret != RT_ERROR_NONE) {
     GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
     return RT_FAILED;
   }
-  task_id_ = taskid;
-
+  // set for task_id_
+  UpdateTaskId();
+  GELOGI(
+      "KernelTaskInfo Distribute Success. sktenable:%ld taskid:%u sktid:%u stubfunc_name:%s stubfunc:%p "
+      "blockdim:%u stream:%p",
+      env_flag, task_id_, skt_id_, stub_func_name_.c_str(), stub_func_, block_dim_, stream_);
   return SUCCESS;
 }
 
@@ -168,7 +400,7 @@ Status KernelTaskInfo::Release() {
 }
 
 Status KernelTaskInfo::InitTVMTask(DavinciModel *davinci_model, uint16_t offset, const domi::KernelDef &kernel_def) {
-  GELOGD("Do InitTVMTask");
+  GELOGD("Do InitTVMTask.");
   GE_CHECK_NOTNULL(davinci_model);
   // get tvm op desc
   OpDescPtr op_desc = davinci_model->GetOpByIndex(ctx_.opIndex);
@@ -236,7 +468,7 @@ Status KernelTaskInfo::InitTVMTask(DavinciModel *davinci_model, uint16_t offset,
         reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(args_) + offset + sizeof(void *) * input_data_addrs.size());
   }
 
-  davinci_model_->SetZeroCopyAddr(tensor_device_addrs, static_cast<uint8_t *>(args_) + offset);
+  davinci_model_->SetZeroCopyAddr(op_desc, tensor_device_addrs, static_cast<uint8_t *>(args_) + offset);
   // update origin l2 data
   string sm_desc = kernel_def.sm_desc();
   char *sm_contrl = nullptr;
@@ -253,6 +485,7 @@ Status KernelTaskInfo::InitTVMTask(DavinciModel *davinci_model, uint16_t offset,
   for (uint32_t data_index = 0; data_index < l2_ctrl_info_data_count; ++data_index) {
     if (l2_ctrl_info->data[data_index].L2_mirror_addr != 0) {
       l2_ctrl_info->data[data_index].L2_mirror_addr += data_base_addr;
+      l2_ctrl_info->data[data_index].L2_load_to_ddr = IsL2CpToDDR(l2_ctrl_info->data[data_index].L2_load_to_ddr);
     }
   }
 
@@ -308,6 +541,7 @@ Status KernelTaskInfo::InitAICPUCustomTask(const std::map
-  davinci_model_->SetZeroCopyAddr(input_data_addrs, custom_info_.input_addrs);
-  davinci_model_->SetZeroCopyAddr(output_data_addrs, custom_info_.output_addrs);
+  davinci_model_->SetZeroCopyAddr(op_desc, input_data_addrs, custom_info_.input_addrs);
+  davinci_model_->SetZeroCopyAddr(op_desc, output_data_addrs, custom_info_.output_addrs);
 
   return SUCCESS;
 }
 
@@ -431,6 +665,7 @@ Status KernelTaskInfo::InitCceTask(DavinciModel *davinci_model, const domi::Kern
     GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
     return RT_FAILED;
   }
+  GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "cce task physical memory.", kernel_def.args_size())
 
   rt_ret = rtMemcpy(args_, kernel_def.args_size(), kernel_def.args().data(), kernel_def.args_size(),
                     RT_MEMCPY_HOST_TO_DEVICE);
@@ -470,6 +705,7 @@ Status KernelTaskInfo::InitAicpuTask(const std::map &op_lis
   // copy args to new host memory
   std::unique_ptr<uint8_t[]> args_addr(new (std::nothrow) uint8_t[args_size_]);
+  GE_PRINT_DYNAMIC_MEMORY(new, "cce task physical memory.", sizeof(uint8_t) * args_size_)
   errno_t sec_ret = memcpy_s(static_cast<void *>(args_addr.get()), args_size_,
                              static_cast<const void *>(kernel_def.args().data()), args_size_);
   if (sec_ret != EOK) {
@@ -501,6 +737,7 @@ Status KernelTaskInfo::InitAicpuTask(const std::map &op_lis
     GELOGE(RT_FAILED, "Call rt api(rtMalloc) failed, ret: 0x%X", rt_ret);
     return RT_FAILED;
   }
+  GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "cce task physical memory.", args_size_)
 
   // copy args to device
   rt_ret = rtMemcpy(args_, args_size_, static_cast<void *>(args_addr.get()), args_size_, RT_MEMCPY_HOST_TO_DEVICE);
@@ -515,7 +752,7 @@ Status KernelTaskInfo::InitAicpuTask(const std::map &op_lis
                                       sizeof(void *) * input_addrs.size());
   }
 
-  davinci_model_->SetZeroCopyAddr(io_addrs, static_cast<uint8_t *>(args_) + sizeof(aicpu::AicpuParamHead));
+  davinci_model_->SetZeroCopyAddr(op_desc, io_addrs, static_cast<uint8_t *>(args_) + sizeof(aicpu::AicpuParamHead));
 
   return SUCCESS;
 }
 
@@ -564,7 +801,6 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector &input_d
     GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
     return RT_FAILED;
   }
-
   for (std::size_t i = 0; i < output_size; ++i) {
     rt_ret = rtMemcpy(static_cast<opTensor_t *>(custom_info_.output_descs) + i, sizeof(opTensor_t),
                       const_cast<opTensor_t *>(&input_descs[i]), sizeof(opTensor_t), RT_MEMCPY_HOST_TO_DEVICE);
@@ -661,7 +897,6 @@ Status KernelTaskInfo::CceUpdateKernelArgs(const domi::KernelContext &context, u
                                            uint64_t &weight_base_addr, uint64_t &var_base_addr, std::string &sm_desc,
                                            std::string &flowtable, const domi::KernelDef &kernel_def) {
   char *sm_contrl = nullptr;
-
   if (!sm_desc.empty()) {
     sm_contrl = const_cast<char *>(sm_desc.data());
   }
@@ -669,18 +904,14 @@ Status KernelTaskInfo::CceUpdateKernelArgs(const domi::KernelContext &context, u
   std::string file_name = "libcce.so";
   std::string path = PluginManager::GetPath();
   path.append(file_name);
-  char canonicalPath[PATH_MAX] = {0};
-  if (path.length() >= PATH_MAX) {
-    GELOGW("File path is too long.");
-    return FAILED;
-  }
-  if (realpath(path.c_str(), canonicalPath) == nullptr) {
+  string canonicalPath = RealPath(path.c_str());
+  if (canonicalPath.empty()) {
     GELOGW("failed to get realpath of %s", path.c_str());
     return FAILED;
   }
-  GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonicalPath);
-  auto handle = dlopen(canonicalPath, RTLD_NOW | RTLD_GLOBAL);
+  GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonicalPath.c_str());
+  auto handle = dlopen(canonicalPath.c_str(), RTLD_NOW | RTLD_GLOBAL);
   if (handle == nullptr) {
     GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s!
", dlerror()); return FAILED; @@ -725,6 +956,7 @@ Status KernelTaskInfo::SetFlowtable(std::string &flowtable, const domi::KernelDe GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "flowtable refresh of cce scence.", flowtable.size()) rt_ret = rtMemcpy(flowtable_, flowtable.size(), flowtable.data(), flowtable.size(), RT_MEMCPY_HOST_TO_DEVICE); if (rt_ret != RT_ERROR_NONE) { @@ -750,5 +982,26 @@ Status KernelTaskInfo::SetFlowtable(std::string &flowtable, const domi::KernelDe return SUCCESS; } +uint8_t KernelTaskInfo::IsL2CpToDDR(uint8_t origain_L2_load_to_ddr) { + if (origain_L2_load_to_ddr == kL2LoadToDdr) { + return kL2LoadToDdr; + } + + if (dump_flag_ == RT_KERNEL_DUMPFLAG) { + return kL2LoadToDdr; + } + + static char *ge_dump_env = std::getenv("DUMP_OP"); + if (ge_dump_env != nullptr) { + static std::string ge_dump_str(ge_dump_env); + static std::string open_ge_dump("1"); + if (ge_dump_str == open_ge_dump) { + return kL2LoadToDdr; + } + } + + return kL2NotLoadToDdr; +} + REGISTER_TASK_INFO(RT_MODEL_TASK_KERNEL, KernelTaskInfo); } // namespace ge diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h index c1291e1a..5de622eb 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h @@ -31,6 +31,7 @@ class KernelTaskInfo : public TaskInfo { KernelTaskInfo() : ctx_(), + fusion_op_info_(), stub_func_(nullptr), args_(nullptr), sm_desc_(nullptr), @@ -39,12 +40,20 @@ class KernelTaskInfo : public TaskInfo { args_size_(0), flowtable_size_(0), task_id_(0), + stream_id_(0), so_name_(""), kernel_name_(""), kernel_type_(cce::ccKernelType::CCE_AI_CORE), dump_flag_(RT_KERNEL_DEFAULT), dump_args_(nullptr), - davinci_model_(nullptr) {} + op_desc_(nullptr), + davinci_model_(nullptr), + skt_id_(0), + stub_func_name_(""), + is_l1_fusion_enable_(false), + is_n_batch_spilt_(false), + group_key_(-1), + has_group_key_(false) {} ~KernelTaskInfo() override { davinci_model_ = nullptr; @@ -62,14 +71,21 @@ class KernelTaskInfo : public TaskInfo { cce::ccOpContext *GetCtx() override { return &ctx_; } + FusionOpInfo *GetFusionOpInfo() override { return &fusion_op_info_; } + uint32_t GetTaskID() override { return task_id_; } + uint32_t GetStreamId() override { return stream_id_; } + uintptr_t GetDumpArgs() override { auto ret = reinterpret_cast(dump_args_); return ret; } + uint32_t GetSktTaskID() override { return skt_id_; } + cce::ccOpContext ctx_; + FusionOpInfo fusion_op_info_; private: Status InitTVMTask(DavinciModel *davinci_model, uint16_t offset, const domi::KernelDef &kernel_def); @@ -97,8 +113,24 @@ class KernelTaskInfo : public TaskInfo { Status SetFlowtable(std::string &flowtable, const domi::KernelDef &kernel_def); + uint8_t IsL2CpToDDR(uint8_t origain_L2_load_to_ddr); + static void FreeRtMem(void **ptr); + Status SuperKernelDistribute(); + + // For super kernel + Status SaveSKTDumpInfo(); + void UpdateTaskId(); + void UpdateSKTTaskId(); + Status SKTFinalize(); + Status SuperKernelLaunch(); + Status SaveSuperKernelInfo(); + bool IsMarkedLastNode(); + bool IsMarkedFirstNode(); + bool FirstCallSKTLaunchCheck(); + bool DoubleCallSKTSaveCheck(); + void *stub_func_; void *args_; void *sm_desc_; @@ -107,13 +139,23 @@ class KernelTaskInfo : public TaskInfo { uint32_t args_size_; uint32_t flowtable_size_; uint32_t task_id_; + uint32_t stream_id_; std::string so_name_; 
   std::string kernel_name_;
   cce::ccKernelType kernel_type_;
   uint32_t dump_flag_;
   void *dump_args_;
+  OpDescPtr op_desc_;
   DavinciModel *davinci_model_;
 
+  // For super kernel
+  uint32_t skt_id_;
+  std::string stub_func_name_;
+  bool is_l1_fusion_enable_;
+  bool is_n_batch_spilt_;
+  int64_t group_key_;
+  bool has_group_key_;
+
   struct AICPUCustomInfo {
     void *input_descs = nullptr;
     void *input_addrs = nullptr;
@@ -121,6 +163,21 @@ class KernelTaskInfo : public TaskInfo {
     void *output_addrs = nullptr;
     void *attr_handle = nullptr;
   } custom_info_;
+
+  // For super kernel
+  static struct SuperKernelTaskInfo {
+    uint32_t last_block_dim;
+    uint32_t last_args_size;
+    uint32_t last_task_id;
+    void *last_stream;
+    void *last_sm_desc;
+    std::vector<void *> kernel_list;
+    std::vector<void *> arg_list;
+    uint32_t last_dump_flag;
+    int64_t last_group_key;
+    uintptr_t last_dump_args;
+    OpDescPtr last_op;
+  } skt_info_;
 };
 }  // namespace ge
 #endif  // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_KERNEL_TASK_INFO_H_
diff --git a/src/ge/graph/load/new_model_manager/task_info/label_goto_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/label_goto_task_info.cc
index 0aece056..9124be9f 100644
--- a/src/ge/graph/load/new_model_manager/task_info/label_goto_task_info.cc
+++ b/src/ge/graph/load/new_model_manager/task_info/label_goto_task_info.cc
@@ -47,6 +47,7 @@ Status LabelGotoTaskInfo::Distribute() {
     return RT_FAILED;
   }
 
+  GELOGI("LabelGotoTaskInfo Distribute Success.");
   return SUCCESS;
 }
 
diff --git a/src/ge/graph/load/new_model_manager/task_info/label_set_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/label_set_task_info.cc
index 397a21bd..75679ec4 100644
--- a/src/ge/graph/load/new_model_manager/task_info/label_set_task_info.cc
+++ b/src/ge/graph/load/new_model_manager/task_info/label_set_task_info.cc
@@ -53,6 +53,7 @@ Status LabelSetTaskInfo::Distribute() {
     return RT_FAILED;
   }
 
+  GELOGI("LabelSetTaskInfo Distribute Success.");
   return SUCCESS;
 }
 
diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc
index e62228d6..cdd9eb37 100644
--- a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc
+++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc
@@ -33,9 +33,6 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da
   }
 
   auto memcpy_async_def = task_def.memcpy_async();
-
-  GELOGI("InitMemcpyAsyncTaskInfo start.");
-
   uint64_t logic_dst = memcpy_async_def.dst();
   uint64_t logic_src = memcpy_async_def.src();
 
@@ -59,8 +56,7 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da
 }
 
 Status MemcpyAsyncTaskInfo::Distribute() {
-  GELOGI("MemcpyAsyncTaskInfo Distribute Start.");
-  GELOGI("Distribute MemcpyAsync, dst_max:%lu, count:%lu, kind:%u.", dst_max_, count_, kind_);
+  GELOGI("MemcpyAsyncTaskInfo Distribute Start.
dst_max:%lu, count:%lu, kind:%u.", dst_max_, count_, kind_); rtError_t rt_ret = rtMemcpyAsync(dst_, dst_max_, src_, count_, static_cast(kind_), stream_); if (rt_ret != RT_ERROR_NONE) { @@ -68,13 +64,14 @@ Status MemcpyAsyncTaskInfo::Distribute() { return RT_FAILED; } + GELOGI("MemcpyAsyncTaskInfo Distribute Success."); return SUCCESS; } Status MemcpyAsyncTaskInfo::GetUpdateBaseAddr(DavinciModel *davinci_model, uint64_t update_addr, uint64_t &base_addr) { GE_CHECK_NOTNULL(davinci_model); - uint64_t data_base_addr = reinterpret_cast(reinterpret_cast(davinci_model->MemBase())) - - davinci_model->GetRtBaseAddr(); + uint64_t data_base_addr = + reinterpret_cast(reinterpret_cast(davinci_model->MemBase())) - davinci_model->GetRtBaseAddr(); uint64_t weight_base_addr = reinterpret_cast(reinterpret_cast(davinci_model->WeightsMemBase())) - davinci_model->GetRtWeightAddr(); uint64_t var_base_addr = reinterpret_cast(reinterpret_cast(davinci_model->VarMemBase())) - diff --git a/src/ge/graph/load/new_model_manager/task_info/profiler_trace_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/profiler_trace_task_info.cc index c7b3deca..1232ddb2 100644 --- a/src/ge/graph/load/new_model_manager/task_info/profiler_trace_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/profiler_trace_task_info.cc @@ -33,18 +33,16 @@ Status ProfilerTraceTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel * } auto log_time_stamp_def = task_def.log_timestamp(); - GELOGI("do InitLogTimeStampTaskInfo"); - log_id_ = log_time_stamp_def.logid(); notify_ = log_time_stamp_def.notify(); flat_ = log_time_stamp_def.flat(); + GELOGI("ProfilerTraceTaskInfo Init Success."); return SUCCESS; } Status ProfilerTraceTaskInfo::Distribute() { - GELOGI("ProfilerTraceTaskInfo Distribute Start."); - GELOGI("rtProfilerTrace: logid = %lu. notify = %d.", log_id_, notify_); + GELOGI("ProfilerTraceTaskInfo Distribute Start. logid = %lu. 
notify = %d.", log_id_, notify_); rtError_t rt_ret = rtProfilerTrace(log_id_, notify_, flat_, stream_); if (rt_ret != RT_ERROR_NONE) { @@ -52,6 +50,7 @@ Status ProfilerTraceTaskInfo::Distribute() { return RT_FAILED; } + GELOGI("ProfilerTraceTaskInfo Distribute Success."); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc index aa2c3284..3d73b9cb 100644 --- a/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc @@ -36,7 +36,6 @@ Status StreamActiveTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d } auto stream_active_def = task_def.stream_active(); - GELOGI("InitStreamActiveTaskInfo start."); uint32_t op_index = stream_active_def.op_index(); uint32_t internal_index = davinci_model->GetFlowctrlIndex(op_index); @@ -75,7 +74,6 @@ Status StreamActiveTaskInfo::Distribute() { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } - return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc index 5dd3c061..c14a0e1f 100644 --- a/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc @@ -41,7 +41,6 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d } auto stream_switch_def = task_def.stream_switch(); - GELOGI("InitStreamSwitchTaskInfo start."); uint32_t op_index = stream_switch_def.op_index(); // get StreamSwitch op @@ -62,8 +61,8 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d size_t input_size = op_desc->GetInputsSize(); if (input_data_addr.size() != STREAM_SWITCH_INPUT_NUM || input_size != STREAM_SWITCH_INPUT_NUM) { - GELOGE(INTERNAL_ERROR, "Input num should be %u. inputAddr size:%zu, inputDesc size:%zu.", - STREAM_SWITCH_INPUT_NUM, input_data_addr.size(), input_size); + GELOGE(INTERNAL_ERROR, "Input num should be %u. inputAddr size:%zu, inputDesc size:%zu.", STREAM_SWITCH_INPUT_NUM, + input_data_addr.size(), input_size); return INTERNAL_ERROR; } @@ -96,7 +95,6 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d } data_type_ = static_cast(data_type); } - return SUCCESS; } @@ -107,7 +105,6 @@ Status StreamSwitchTaskInfo::Distribute() { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } - return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.cc new file mode 100644 index 00000000..f4f62df0 --- /dev/null +++ b/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.cc @@ -0,0 +1,152 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "graph/load/new_model_manager/task_info/stream_switchn_task_info.h" +#include +#include "framework/common/debug/ge_log.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/load/new_model_manager/davinci_model.h" +#include "graph/load/new_model_manager/model_utils.h" + +namespace { +const uint32_t kDynamicBtachParamNum = 1; +const uint32_t kDynamicResolutionParamNum = 2; +} // namespace + +namespace ge { +Status StreamSwitchNTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { + GELOGI("StreamSwitchNTaskInfo Init Start."); + if (davinci_model == nullptr) { + GELOGE(PARAM_INVALID, "davinci_model is null!"); + return PARAM_INVALID; + } + + Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); + if (ret != SUCCESS) { + return FAILED; + } + + auto stream_switchn_def = task_def.stream_switch_n(); + // set size_ + input_size_ = stream_switchn_def.size(); + if (input_size_ != kDynamicBtachParamNum && input_size_ != kDynamicResolutionParamNum) { + GELOGE(FAILED, "The size of dynamic batch or imagesize input is 1 or 2, now it is %u.", input_size_); + return FAILED; + } + + // set value_ptr_ + auto value = stream_switchn_def.target_value(); + if (value.size() == 0) { + GELOGE(FAILED, "The number of gears in dynamic batch scenario can not be 0."); + return FAILED; + } + for (int i = 0; i < value.size(); ++i) { + GELOGD("InitStreamSwitchTaskInfo, valuePtr value[%d]: %ld.", i, value[i]); + value_list_.emplace_back(value[i]); + } + value_ptr_ = &value_list_[0]; + + uint32_t op_index = stream_switchn_def.op_index(); + + // get StreamSwitchN op + auto op_list = davinci_model->GetOpList(); + auto iter = op_list.find(op_index); + if (iter == op_list.end()) { + GELOGE(FAILED, "Index is out of range, index: %u", op_index); + return FAILED; + } + OpDescPtr op_desc = iter->second; + if (op_desc == nullptr) { + GELOGE(FAILED, "SwitchN op is nullptr."); + return FAILED; + } + + // set element_size_ + if (!AttrUtils::GetInt(op_desc, ATTR_NAME_BATCH_NUM, element_size_)) { + GELOGE(FAILED, "Get ATTR_NAME_BATCH_NUM of switchN op failed."); + return FAILED; + } + + if (GetTrueStreamPtr(op_desc, davinci_model) != SUCCESS) { + GELOGE(FAILED, "Get true stream ptr of switchN op failed."); + return FAILED; + } + + // set input_ptr_ + auto input_data_addr = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); + if (input_data_addr.empty()) { + GELOGE(FAILED, "Input data addr is nullptr."); + return FAILED; + } + input_ptr_ = input_data_addr[0]; + GELOGI("StreamSwitchNTaskInfo Init Success, inputSize:%u, elementSize:%d, trueStreamID:%ld.", input_size_, + element_size_, op_desc->GetStreamId()); + + return SUCCESS; +} + +Status StreamSwitchNTaskInfo::Distribute() { + GELOGI("StreamSwitchNTaskInfo Distribute Start."); + rtError_t rt_ret = + rtStreamSwitchN(input_ptr_, input_size_, value_ptr_, true_stream_ptr_, element_size_, stream_, data_type_); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + + GELOGI("StreamSwitchNTaskInfo Distribute Success. 
inputSize:%u, elementSize:%d, datatype:%d.", input_size_, + element_size_, data_type_); + return SUCCESS; +} + +Status StreamSwitchNTaskInfo::GetTrueStreamPtr(const OpDescPtr &op_desc, DavinciModel *davinci_model) { + vector true_stream_id_list; + if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, true_stream_id_list)) { + GELOGE(FAILED, "StreamSwitchNOp get attr ACTIVE_STREAM_LIST fail."); + return FAILED; + } + + if (true_stream_id_list.size() > davinci_model->GetStreamList().size()) { + GELOGE(FAILED, + "InitStreamSwitchNTaskInfo get true stream id list failed. true stream size:%zu, " + "stream list size:%zu.", + true_stream_id_list.size(), davinci_model->GetStreamList().size()); + return FAILED; + } + + // set true_stream_ptr_ + for (size_t i = 0; i < true_stream_id_list.size(); ++i) { + uint32_t true_stream_id = true_stream_id_list[i]; + if (true_stream_id >= davinci_model->GetStreamList().size()) { + GELOGE(FAILED, "InitStreamSwitchNTaskInfo stream id invalid. id:%u, stream list size:%zu.", true_stream_id, + davinci_model->GetStreamList().size()); + return FAILED; + } + rtStream_t true_stream = davinci_model->GetStreamList()[true_stream_id]; + true_stream_list_.emplace_back(true_stream); + GELOGD("InitStreamSwitchTaskInfo, trueStreamList index: %zu.", i); + } + + if (true_stream_list_.empty()) { + GELOGE(FAILED, "true stream list is null."); + return FAILED; + } + true_stream_ptr_ = &true_stream_list_[0]; + return SUCCESS; +} + +REGISTER_TASK_INFO(RT_MODEL_TASK_STREAM_SWITCH_N, StreamSwitchNTaskInfo); +} // namespace ge \ No newline at end of file diff --git a/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.h b/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.h new file mode 100644 index 00000000..d1002da7 --- /dev/null +++ b/src/ge/graph/load/new_model_manager/task_info/stream_switchn_task_info.h @@ -0,0 +1,52 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_STREAM_SWITCHN_TASK_INFO_H_
+#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_STREAM_SWITCHN_TASK_INFO_H_
+
+#include "graph/load/new_model_manager/task_info/task_info.h"
+#include "graph/op_desc.h"
+
+namespace ge {
+class StreamSwitchNTaskInfo : public TaskInfo {
+ public:
+  StreamSwitchNTaskInfo()
+      : input_ptr_(nullptr),
+        input_size_(0),
+        value_ptr_(nullptr),
+        true_stream_ptr_(nullptr),
+        element_size_(0),
+        data_type_(RT_SWITCH_INT64) {}
+
+  ~StreamSwitchNTaskInfo() override {}
+
+  Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override;
+
+  Status Distribute() override;
+
+ private:
+  Status GetTrueStreamPtr(const OpDescPtr &op_desc, DavinciModel *davinci_model);
+  void *input_ptr_;
+  uint32_t input_size_;
+  void *value_ptr_;
+  rtStream_t *true_stream_ptr_;
+  uint32_t element_size_;
+  rtSwitchDataType_t data_type_;
+  vector<rtStream_t> true_stream_list_;
+  vector<int64_t> value_list_;
+};
+}  // namespace ge
+#endif  // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_STREAM_SWITCHN_TASK_INFO_H_
diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc
new file mode 100644
index 00000000..38dbd8b3
--- /dev/null
+++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "super_kernel.h"
+#include "framework/common/debug/ge_log.h"
+
+namespace ge {
+namespace skt {
+Status SuperKernel::Launch(rtStream_t stream, bool dump_flag) {
+  const void *func_stub_ = this->GetFuncStub();
+
+  const void *args[] = {this->GetNavTablePtr(), (const void *)this->GetNavTableSize()};
+
+  void *device_args_addr = nullptr;
+  rtError_t rt_ret = rtMalloc((void **)&(device_args_addr), sizeof(args), RT_MEMORY_HBM);
+  GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;)
+  rt_ret = rtMemcpy((void *)device_args_addr, sizeof(args), (void *)args, sizeof(args), RT_MEMCPY_HOST_TO_DEVICE);
+  GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. error: 0x%X", rt_ret); return FAILED;)
+  rt_ret = rtKernelLaunchWithFlag((void *const)func_stub_, block_dim_, device_args_addr, sizeof(args), NULL, stream,
+                                  dump_flag);
+  GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelLaunchWithFlag failed. error: 0x%X", rt_ret);
+                  return FAILED;)
+  return SUCCESS;
+}
+}  // namespace skt
+}  // namespace ge
diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h
new file mode 100644
index 00000000..b662d97b
--- /dev/null
+++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h
@@ -0,0 +1,45 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SUPER_KERNEL_H
+#define SUPER_KERNEL_H
+
+#include "framework/common/fmk_error_codes.h"
+#include "framework/common/debug/log.h"
+#include "runtime/rt.h"
+
+namespace ge {
+namespace skt {
+class SuperKernel {
+ private:
+  const void *func_stub_;
+  void *dev_nav_table_;
+  uint64_t nav_table_size_;
+  uint32_t block_dim_;
+
+ public:
+  SuperKernel(const void *stub, void *ptr, uint64_t sz, uint32_t dim)
+      : func_stub_(stub), dev_nav_table_(ptr), nav_table_size_(sz), block_dim_(dim) {}
+  ~SuperKernel() {}
+  Status Launch(rtStream_t stream, bool dump_flag);
+  const void *GetFuncStub() const { return func_stub_; }
+  const void *GetNavTablePtr() const { return dev_nav_table_; }
+  uint64_t GetNavTableSize() const { return nav_table_size_; }
+  uint32_t GetBlockDim() const { return block_dim_; }
+};
+}  // namespace skt
+}  // namespace ge
+#endif
diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc
new file mode 100644
index 00000000..ab3f68f1
--- /dev/null
+++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc
@@ -0,0 +1,160 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "super_kernel_factory.h"
+#include "framework/common/debug/ge_log.h"
+
+namespace ge {
+namespace skt {
+SuperKernelFactory &SuperKernelFactory::GetInstance() {
+  static SuperKernelFactory factory;
+  return factory;
+}
+
+Status SuperKernelFactory::Init() {
+  if (!is_init_) {
+    rtError_t rt_ret;
+    rt_ret = rtGetFunctionByName(this->sk_stub_name_.c_str(), &this->func_stub_);
+    GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret,
+                                                    "rtGetFunctionByName "
+                                                    "failed. stub_func: %s",
+                                                    this->sk_stub_name_.c_str());
+                    return FAILED;)
+    rt_ret = rtGetAddrByFun(this->func_stub_, &this->func_ptr_);
+    GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret);
+                    return FAILED;)
+    if (this->use_physical_address_ != nullptr) {
+      void *skt_func = nullptr;
+      rt_ret = rtKernelConfigTransArg(this->func_ptr_, sizeof(uint64_t), 0, &skt_func);
+      GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret);
+                      return FAILED;)
+      GELOGD(
+          "SKT: fuseKernels super_kernel_template subFunc %p, device func "
+          "address %p, device physic PC %p",
+          (uint64_t)this->func_stub_, (uint64_t)this->func_ptr_, (uint64_t)skt_func);
+    } else {
+      GELOGD(
+          "SKT: fuseKernels super_kernel_template subFunc %p, device func "
+          "address %p",
+          (uint64_t)this->func_stub_, (uint64_t)this->func_ptr_);
+    }
+  }
+  is_init_ = true;
+
+  return SUCCESS;
+}
+
+Status SuperKernelFactory::Uninitialize() {
+  is_init_ = false;
+  func_stub_ = nullptr;
+  func_ptr_ = nullptr;
+  return SUCCESS;
+}
+
+Status SuperKernelFactory::FuseKernels(const std::vector<void *> &stub_func_list,
+                                       const std::vector<void *> &args_addr_list, uint32_t block_dim,
+                                       SuperKernel *&h) {
+  // Iterate through the ops to be fused
+  // Each subkernel to be fused contains 2 fields: fn address offset, args
+  // address.
+  // Generate the nav table contents. The format is as follows:
+  // [[fn_ptr_address, args_addr1], [fn_ptr_address2, args_addr2],
+  // ...]
+  if (this->func_stub_ == nullptr) {
+    GELOGW("SKT: func_stub_ is empty. Please make sure init() is run first");
+    return FAILED;
+  }
+
+  size_t super_kernel_size = stub_func_list.size();
+  if (super_kernel_size != args_addr_list.size()) {
+    GELOGW("SKT: The size of stub_func_list doesn't match args_addr_list");
+    return FAILED;
+  }
+
+  if (super_kernel_size < 2) {
+    GELOGW(
+        "SKT: the number of kernels being fused must be greater than or "
+        "equal to 2");
+    return FAILED;
+  }
+  GELOGI("SKT: superkernel start fuse, superkernel size %zu.", stub_func_list.size());
+  uint64_t nav_table[2 * stub_func_list.size()];
+  uint64_t nav_table_size = 2 * stub_func_list.size() * sizeof(int64_t);
+
+  rtError_t rt_ret;
+  if (this->use_physical_address_ != nullptr) {
+    for (unsigned i = 0; i < stub_func_list.size(); i++) {
+      void *sub_device_func = nullptr;
+      rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func);
+      GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret);
+                      return FAILED;)
+      void *sub_device_func_pys = nullptr;
+      void *args_addr_pys = nullptr;
+      rt_ret = rtKernelConfigTransArg(sub_device_func, sizeof(uint64_t), 0, &sub_device_func_pys);
+      GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret);
+                      return FAILED;)
+      rt_ret = rtKernelConfigTransArg(args_addr_list[i], sizeof(uint64_t), 0, &args_addr_pys);
+      GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret);
+                      return FAILED;)
+      GELOGD(
+          "SKT: fuseKernels subFunc %p, device func address %p, device "
+          "physic func address %p",
+          stub_func_list[i], (uint64_t)sub_device_func, (uint64_t)sub_device_func_pys);
+      nav_table[i * 2] = (uint64_t)sub_device_func_pys / 4;
+      GELOGD("SKT: CALL offset %p", nav_table[i * 2]);
+      nav_table[i * 2 + 1] = (uint64_t)args_addr_pys;
+      GELOGD("SKT: fuseKernels args base address %p", nav_table[i * 2 + 1]);
+    }
+
+    void *hbm_nav_table_addr = nullptr;
+    void *hbm_nav_table_addr_pys = nullptr;
+    rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM);
+    GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;)
+    rt_ret =
+        rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table, nav_table_size, RT_MEMCPY_HOST_TO_DEVICE);
+    GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. error: 0x%X", rt_ret); return FAILED;)
+    rt_ret = rtKernelConfigTransArg(hbm_nav_table_addr, sizeof(uint64_t), 0, &hbm_nav_table_addr_pys);
+    GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret);
+                    return FAILED;)
+
+    GELOGD("SKT: hbm_nav_table_addr %p, hbm_nav_table_addr_pys %p", (uint64_t)hbm_nav_table_addr,
+           (uint64_t)hbm_nav_table_addr_pys);
+    // Create the necessary metadata for the super kernel
+    h = new SuperKernel(this->func_stub_, hbm_nav_table_addr_pys, nav_table_size, block_dim);
+  } else {
+    for (unsigned i = 0; i < stub_func_list.size(); i++) {
+      void *sub_device_func = nullptr;
+      rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func);
+      GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret);
+                      return FAILED;)
+      GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], (uint64_t)sub_device_func);
+      nav_table[i * 2] = (uint64_t)sub_device_func / 4;
+      GELOGD("SKT: CALL offset %p", nav_table[i * 2]);
+      nav_table[i * 2 + 1] = (uint64_t)args_addr_list[i];
+      GELOGD("SKT: fuseKernels args base address %p", nav_table[i * 2 + 1]);
+    }
+    void *hbm_nav_table_addr = nullptr;
+    rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM);
+    GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;)
+    rt_ret =
+        rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table, nav_table_size, RT_MEMCPY_HOST_TO_DEVICE);
+    GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. error: 0x%X", rt_ret); return FAILED;)
+    // Create the necessary metadata for the super kernel
+    h = new SuperKernel(this->func_stub_, hbm_nav_table_addr, nav_table_size, block_dim);
+  }
+  return SUCCESS;
+}
+}  // namespace skt
+}  // namespace ge
diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h
new file mode 100644
index 00000000..7b59d4bf
--- /dev/null
+++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SUPER_KERNEL_FACTORY_H +#define SUPER_KERNEL_FACTORY_H + +#include +#include "super_kernel.h" +#include "framework/common/debug/log.h" + +namespace ge { +namespace skt { +class SuperKernelFactory { + private: + void *func_stub_ = nullptr; + void *func_ptr_ = nullptr; + std::string sk_stub_name_ = "_Z21super_kernel_templatePmm"; + const char *use_physical_address_ = getenv("GE_USE_PHYSICAL_ADDRESS"); + bool is_init_ = false; + SuperKernelFactory(){}; + + public: + SuperKernelFactory(SuperKernelFactory const &) = delete; + void operator=(SuperKernelFactory const &) = delete; + static SuperKernelFactory &GetInstance(); + SuperKernelFactory(const std::string &sk_stub_name_, const std::string &bin_file); + Status Init(); + Status Uninitialize(); + Status FuseKernels(const std::vector &stub_func_list, const std::vector &args_addr_list, + uint32_t block_dim, SuperKernel *&h); +}; +} // namespace skt +} // namespace ge +#endif diff --git a/src/ge/graph/load/new_model_manager/task_info/task_info.h b/src/ge/graph/load/new_model_manager/task_info/task_info.h index 09ba05de..2a0b93c7 100644 --- a/src/ge/graph/load/new_model_manager/task_info/task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/task_info.h @@ -38,10 +38,18 @@ struct RuntimeParam { uint32_t batch_num = 0; uint32_t stream_num = 0; uint32_t event_num = 0; + uint32_t label_num = 0; uint64_t session_id = 0; uint32_t graph_id = 0; }; +typedef struct FusionOpInfo { + vector original_op_names; + string op_name; + uint32_t op_index; + uint32_t stream_id; +} FusionOpInfo; + class DavinciModel; class TaskInfo { @@ -60,8 +68,14 @@ class TaskInfo { virtual uint32_t GetTaskID() { return 0xFFFFFFFF; } + virtual uint32_t GetStreamId() { return 0xFFFFFFFF; } + virtual uintptr_t GetDumpArgs() { return 0; } + virtual uint32_t GetSktTaskID() { return 0xFFFFFFFF; } + + virtual FusionOpInfo *GetFusionOpInfo() { return nullptr; } + protected: Status SetStream(uint32_t stream_id, const std::vector &stream_list); diff --git a/src/ge/graph/load/new_model_manager/tbe_handle_store.cc b/src/ge/graph/load/new_model_manager/tbe_handle_store.cc index 15967ad2..5bdf4c81 100644 --- a/src/ge/graph/load/new_model_manager/tbe_handle_store.cc +++ b/src/ge/graph/load/new_model_manager/tbe_handle_store.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "graph/load/new_model_manager/tbe_handle_store.h" +#include "tbe_handle_store.h" #include #include "common/ge_inner_error_codes.h" diff --git a/src/ge/graph/load/new_model_manager/tbe_handle_store.h b/src/ge/graph/load/new_model_manager/tbe_handle_store.h index 3583064b..a8f68514 100644 --- a/src/ge/graph/load/new_model_manager/tbe_handle_store.h +++ b/src/ge/graph/load/new_model_manager/tbe_handle_store.h @@ -96,4 +96,4 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY TBEHandleStore { }; } // namespace ge -#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TBE_HANDLE_STORE_H_ +#endif // NEW_GE_TBE_HANDLE_STORE_H diff --git a/src/ge/graph/load/output/output.cc b/src/ge/graph/load/output/output.cc index 8351715c..d922ce7c 100644 --- a/src/ge/graph/load/output/output.cc +++ b/src/ge/graph/load/output/output.cc @@ -67,7 +67,7 @@ Status Output::Init() { } for (size_t i = 0; i < input_num_; i++) { - uint32_t tensor_size = 0; + int64_t tensor_size = 0; auto input_desc = op_desc_->GetInputDescPtr(i); GE_CHECK_NOTNULL(input_desc); Status ret = TensorUtils::GetSize(*input_desc, tensor_size); @@ -84,6 +84,8 @@ Status Output::Init() { } } + GELOGI("Init output:%lu, %lu, %lu", input_num_, v_input_size_.size(), v_input_data_addr_.size()); + return SUCCESS; } @@ -95,6 +97,13 @@ Status Output::Init() { /// Status Output::CopyResult(OutputData &rslt, uint32_t data_begin, uint32_t &data_index, bool support_mem_share) { uint32_t data_count = 0; + if (input_num_ > rslt.blobs.size() - data_begin) { + GELOGE(FAILED, "Tensor num %zu, data_buf num: %zu.", input_num_, rslt.blobs.size() - data_begin); + return FAILED; + } else if (input_num_ < rslt.blobs.size() - data_begin) { + GELOGW("Tensor num %zu, data_buf num: %zu.", input_num_, rslt.blobs.size() - data_begin); + } + for (size_t i = 0; i < input_num_; i++) { DataBuffer data_buf = rslt.blobs[data_begin + data_count]; Status ret = SetDataBuf(data_buf, data_count, i, support_mem_share); @@ -123,10 +132,11 @@ Status Output::SetDataBuf(DataBuffer &data_buf, uint32_t &data_count, size_t i, } if (data_buf.isDataSupportMemShare && support_mem_share) { - GELOGD("No need to copy input data, user's output data buffer can be shared."); + GELOGI("No need to copy input data, user's output data buffer can be shared."); } else { // Copy result to Databuf - uint32_t size = v_input_size_[i]; + int64_t size = v_input_size_[i]; + GELOGI("Tensor data size before: %ld", size); graphStatus graph_status = TensorUtils::GetTensorSizeInBytes(*tensor_desc, size); if (graph_status != ge::GRAPH_SUCCESS) { @@ -134,12 +144,19 @@ Status Output::SetDataBuf(DataBuffer &data_buf, uint32_t &data_count, size_t i, return FAILED; } + if (data_buf.length < size) { + GELOGE(FAILED, "Tensor data size: %ld data_buf length: %ld", size, data_buf.length); + return FAILED; + } else if (data_buf.length > size) { + GELOGW("Tensor data size: %ld data_buf length: %ld", size, data_buf.length); + } + rtError_t rt_ret = rtMemcpy(data_buf.data, size, v_input_data_addr_[i], size, RT_MEMCPY_DEVICE_TO_HOST); if (rt_ret != RT_ERROR_NONE) { GELOGE(rt_ret, "rtmemcpy error"); return FAILED; } - GELOGD("Tensor data size: %u data_buflength: %u", size, data_buf.length); + GELOGI("Tensor data size: %ld data_buf length: %ld", size, data_buf.length); } ++data_count; @@ -149,7 +166,7 @@ Status Output::SetDataBuf(DataBuffer &data_buf, uint32_t &data_count, size_t i, return SUCCESS; } -void Output::GetOutputData(vector &v_data_addr, vector &v_data_size) { +void Output::GetOutputData(vector &v_data_addr, vector 
&v_data_size) { for (size_t i = 0; i < input_num_; ++i) { v_data_addr.push_back(v_input_data_addr_[i]); v_data_size.push_back(v_input_size_[i]); diff --git a/src/ge/graph/load/output/output.h b/src/ge/graph/load/output/output.h index cd74a59d..d93b8de9 100644 --- a/src/ge/graph/load/output/output.h +++ b/src/ge/graph/load/output/output.h @@ -23,13 +23,12 @@ #include "common/debug/log.h" #include "common/op/attr_value_util.h" #include "common/op/ge_op_utils.h" -#include "common/op/op_parser_util.h" #include "common/types.h" #include "common/util.h" #include "common/ge_types.h" -#include "graph/debug/ge_attr_define.h" #include "graph/load/new_model_manager/davinci_model.h" #include "graph/op_desc.h" +#include "graph/debug/ge_attr_define.h" namespace ge { using std::string; @@ -68,7 +67,7 @@ class Output { /// @brief Get Output data and size. /// @return void /// - void GetOutputData(vector &v_data_addr, vector &v_data_size); + void GetOutputData(vector &v_data_addr, vector &v_data_size); // Copy assignment operator and copy constructor are deleted Output &operator=(const Output &output) = delete; @@ -88,7 +87,7 @@ class Output { // Input descriptions size_t input_num_; vector v_input_data_addr_; // init as:buf_base + op_def_->input(i)); - vector v_input_size_; + vector v_input_size_; }; } // namespace ge diff --git a/src/ge/graph/manager/graph_manager.cc b/src/ge/graph/manager/graph_manager.cc index 9796b2ac..f6fc8389 100644 --- a/src/ge/graph/manager/graph_manager.cc +++ b/src/ge/graph/manager/graph_manager.cc @@ -47,7 +47,6 @@ #include "graph/passes/identify_reference_pass.h" #include "graph/passes/link_gen_mask_nodes_pass.h" #include "graph/passes/multi_batch_pass.h" -#include "graph/passes/no_reshape_op_remove_pass.h" #include "graph/passes/permute_pass.h" #include "graph/passes/reshape_remove_pass.h" #include "graph/passes/same_transdata_breadth_fusion_pass.h" @@ -55,9 +54,12 @@ #include "graph/passes/transop_depth_fusion_pass.h" #include "graph/passes/transop_nearby_allreduce_fusion_pass.h" #include "graph/passes/transop_without_reshape_fusion_pass.h" +#include "graph/passes/cast_remove_pass.h" #include "graph/passes/transpose_transdata_pass.h" #include "graph/passes/variable_op_pass.h" +#include "graph/passes/variable_prepare_op_pass.h" #include "graph/passes/variable_ref_delete_op_pass.h" +#include "graph/passes/replace_with_empty_const_pass.h" #include "graph/utils/tensor_adapter.h" #include "inc/pass_manager.h" #include "init/gelib.h" @@ -81,16 +83,11 @@ Status GraphManager::Initialize(const std::map &options) { } // malloc - graph_run_listener_ = MakeShared(); + graph_run_listener_ = MakeShared(sync_run_mutex_, condition_); if (graph_run_listener_ == nullptr) { GELOGE(MEMALLOC_FAILED, "Make shared failed"); return MEMALLOC_FAILED; } - Status ret = graph_run_listener_->SetCondition(&sync_run_mutex_, &condition_); - if (ret != SUCCESS) { - GELOGE(ret, "[Initialize] mutex and cond is invalid."); - return ret; - } // graph context graph_context_ = MakeShared(); if (graph_context_ == nullptr) { @@ -99,7 +96,7 @@ Status GraphManager::Initialize(const std::map &options) { } // parse option parameters - ret = ParseOptions(options); + Status ret = ParseOptions(options); if (ret != SUCCESS) { GELOGE(ret, "[Initialize] parse options failed."); return ret; @@ -135,7 +132,9 @@ Status GraphManager::Finalize() { return SUCCESS; } - GE_CHK_STATUS_RET(graph_executor_.FreeExecuteMemory()); + if (graph_executor_.FreeExecuteMemory() != SUCCESS) { + GELOGW("Graph executor FreeExecuteMemory failed, 
resources may not be released correctly.");
+  }
 
   StopQueue(this);
 
@@ -240,10 +239,10 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph,
   return SUCCESS;
 }
 
-Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const std::vector<SubGraphInfoPtr> &sub_graph_list) {
+Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::ComputeGraphPtr &original_compute_graph) {
   std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
   if (instance_ptr != nullptr && instance_ptr->InitFlag()) {
-    Status ret = graph_partitioner_.MergeAfterSubGraphOptimization(compute_graph, sub_graph_list);
+    Status ret = graph_partitioner_.MergeAfterSubGraphOptimization(compute_graph, original_compute_graph);
     if (ret != SUCCESS) {
       GELOGE(ret, "merge end and placeholder after subGraph optimization failed.");
       return FAILED;
@@ -254,10 +253,54 @@ Status GraphManager::MergeSubGraph(ComputeGraphPtr &compute_graph, const std::ve
       GELOGE(ret_topo, "[GraphManager]: TopologicalSorting the merged graph failed.");
       return ret_topo;
     }
-  } else if (!sub_graph_list.empty() && (sub_graph_list[0] != nullptr)) {
-    compute_graph = sub_graph_list[0]->GetSubGraph();
+  } else {
+    auto subgraph_list = graph_partitioner_.GetSubGraphMap();
+    if (subgraph_list.find(original_compute_graph) != subgraph_list.end() &&
+        !subgraph_list[original_compute_graph].empty() && subgraph_list[original_compute_graph][0] != nullptr) {
+      compute_graph = subgraph_list[original_compute_graph][0]->GetSubGraph();
+    }
+  }
+
+  return SUCCESS;
+}
+
+Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph) {
+  // use default 16 multi thread
+  const uint32_t thread_num = 16;
+  ThreadPool executor(thread_num);
+  auto sub_graph_map = graph_partitioner_.GetSubGraphMap();
+  std::vector<std::future<Status>> vector_future;
+  const auto &root_subgraph_list = sub_graph_map[compute_graph];
+  for (const auto &subgraph : root_subgraph_list) {
+    std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, subgraph, session_id,
+                                            GetThreadLocalContext());
+    if (!f.valid()) {
+      GELOGE(FAILED, "Future is invalid");
+      return FAILED;
+    }
+    vector_future.emplace_back(std::move(f));
+  }
+
+  for (auto &function_graph : compute_graph->GetAllSubgraphs()) {
+    auto subgraph_list = sub_graph_map[function_graph];
+    for (const auto &subgraph : subgraph_list) {
+      std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, subgraph,
+                                              session_id, GetThreadLocalContext());
+      if (!f.valid()) {
+        GELOGE(FAILED, "Future is invalid");
+        return FAILED;
+      }
+      vector_future.emplace_back(std::move(f));
+    }
+  }
+  GELOGI("All sub graph num is %zu", vector_future.size());
+  for (size_t i = 0; i < vector_future.size(); ++i) {
+    Status ret_status = vector_future[i].get();
+    if (ret_status != SUCCESS) {
+      GELOGE(ret_status, "subgraph %zu optimize failed", i);
+      return ret_status;
+    }
+  }
   return SUCCESS;
 }
 
@@ -272,6 +315,7 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor>
+  GELOGI("PreRun graph node size %zu.", compute_graph->GetDirectNodesSize());
 
   // optimize the summary op in graph: store the summary name and replace the summary ops with net_output op.
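+  // Rough PreRun pipeline (a sketch inferred from this function, not an
+  // authoritative contract): HandleSummaryOp -> Partition -> SetSubgraph
+  // (thread-pool subgraph optimization) -> MergeSubGraph -> Build, with each
+  // stage bracketed by GE_TIMESTAMP_START/GE_TIMESTAMP_END for coarse timing.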
GE_TIMESTAMP_START(HandleSummaryOp); auto ret = graph_optimize_.HandleSummaryOp(compute_graph); @@ -295,52 +339,43 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector sub_graph_list; + // all sub graph list of root graph and sub graph GE_TIMESTAMP_START(GraphPartition); - ret = graph_partitioner_.Partition(compute_graph, sub_graph_list, GraphPartitioner::kPartitioning); + ret = graph_partitioner_.Partition(compute_graph, GraphPartitioner::kPartitioning); if (ret != SUCCESS) { GELOGE(ret, "Graph partition Failed"); return ret; } GE_TIMESTAMP_END(GraphPartition, "GraphPartitioner::Partition1"); GE_TIMESTAMP_START(SetSubgraph); - // use default 16 multi thread - const uint32_t thread_num = 16; - ThreadPool executor(thread_num); - size_t sub_graph_list_size = sub_graph_list.size(); - std::vector> vector_future(sub_graph_list_size); - for (size_t i = 0; i < sub_graph_list_size; ++i) { - std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, sub_graph_list[i], - session_id, GetThreadLocalContext()); - if (!f.valid()) { - GELOGE(FAILED, "Future is invalid"); - return FAILED; - } - vector_future[i] = std::move(f); - } - for (size_t i = 0; i < vector_future.size(); ++i) { - Status ret_status = vector_future[i].get(); - if (ret_status != SUCCESS) { - GELOGE(ret_status, "subgraph %zu optimize failed", i); - return ret_status; - } + ret = SetSubgraph(session_id, compute_graph); + if (ret != SUCCESS) { + GELOGE(ret, "Graph set subgraph Failed"); + return ret; } GE_TIMESTAMP_END(SetSubgraph, "SetSubGraph"); ComputeGraphPtr merged_compute_graph = nullptr; + std::vector merged_sub_graph_list; GE_TIMESTAMP_START(MergeSubgraph); - ret = MergeSubGraph(merged_compute_graph, sub_graph_list); + ret = MergeSubGraph(merged_compute_graph, compute_graph); if (ret != SUCCESS) { GELOGE(ret, "Merge SubGraph Failed"); return ret; } merged_compute_graph->SetSessionID(session_id); merged_compute_graph->SetGraphID(graph_node->GetGraphId()); - GE_TIMESTAMP_END(MergeSubgraph, "GraphManager::MergeSubGraph"); - GraphUtils::DumpGEGraph(merged_compute_graph, "mergedComputeGraph"); GraphUtils::DumpGEGraphToOnnx(*merged_compute_graph, "mergedComputeGraph"); + for (auto &sub_graph : merged_compute_graph->GetAllSubgraphs()) { + string subgraph_name = "mergedComputeGraph" + sub_graph->GetName(); + sub_graph->SetSessionID(session_id); + sub_graph->SetGraphID(graph_node->GetGraphId()); + GraphUtils::DumpGEGraph(merged_compute_graph, subgraph_name); + GraphUtils::DumpGEGraphToOnnx(*merged_compute_graph, subgraph_name); + } + GE_TIMESTAMP_END(MergeSubgraph, "GraphManager::MergeSubGraph"); std::shared_ptr instance_ge = ge::GELib::GetInstance(); if (instance_ge != nullptr && instance_ge->InitFlag()) { @@ -353,7 +388,6 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorGetGraphId())); merged_compute_graph->SetName(graph_name); } + std::vector sub_graph_list; ret = graph_builder_.Build(merged_compute_graph, sub_graph_list, ge_model, session_id); if (ret != SUCCESS) { GELOGE(ret, "SubGraph build Failed."); @@ -514,10 +549,7 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorGetNeedIteration(), - compute_graph_tmp->SetNeedIteration(GraphUtils::CheckIsTrainGraph(compute_graph_tmp));)) + return GE_GRAPH_GRAPH_NODE_NULL;)) std::vector ge_models; @@ -831,9 +863,8 @@ Status GraphManager::ParseOptions(const std::map &opti } // Set save_original_model flag (ge.save_original_model) - GE_CHK_STATUS_RET(ParseOption(options, 
SAVE_ORIGINAL_MODEL, options_.save_original_model), - "Set save original model flag fail"); - GELOGI("Set save original model flag %s", options_.save_original_model ? "true" : "false"); + ParseOption(options, SAVE_ORIGINAL_MODEL, options_.save_original_model); + GELOGI("Set save original model flag %s", options_.save_original_model.c_str()); // Original model file name ParseOption(options, ORIGINAL_MODEL_FILE, options_.original_model_file); @@ -1323,28 +1354,24 @@ bool GraphManager::ConfirmUseOpAndIndexByNode(const ge::NodePtr &var_node, return false; } -void GraphManager::ResetConstType(ge::ComputeGraphPtr &compute_graph) { - if (options_.train_graph_flag) { - for (ge::NodePtr &n : compute_graph->GetDirectNode()) { - if (n->GetOpDesc() == nullptr) { - continue; - } - if (n->GetOpDesc()->GetType() == CONSTANT) { - n->GetOpDesc()->SetType(CONSTANTOP); - } +Status GraphManager::RemoveIsolatedConst(ge::ComputeGraphPtr &compute_graph) { + for (ge::NodePtr &n : compute_graph->GetAllNodes()) { + if (n->GetOpDesc() == nullptr) { + continue; } - } - - if (!options_.train_graph_flag) { - for (ge::NodePtr &n : compute_graph->GetDirectNode()) { - if (n->GetOpDesc() == nullptr) { - continue; - } - if (n->GetOpDesc()->GetType() == CONSTANTOP) { - n->GetOpDesc()->SetType(CONSTANT); + if (n->GetOpDesc()->GetType() == CONSTANT || n->GetOpDesc()->GetType() == CONSTANTOP) { + // reset const type depend on train_flag + options_.train_graph_flag ? n->GetOpDesc()->SetType(CONSTANTOP) : n->GetOpDesc()->SetType(CONSTANT); + if (n->GetOutAllNodes().empty() && n->GetInAllNodes().empty()) { + // it is an isolated constant, just remove it + if (GraphUtils::RemoveJustNode(compute_graph, n) != GRAPH_SUCCESS) { + GELOGE(FAILED, "remove constant %s failed.", n->GetName().c_str()); + return FAILED; + } } } } + return SUCCESS; } Status GraphManager::OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_graph) { @@ -1354,8 +1381,8 @@ Status GraphManager::OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_gra NamesToPass names_to_passes_for_shape; IdentifyReferencePass identify_reference_pass; names_to_passes_for_shape.emplace_back("IdentifyReferencePass", &identify_reference_pass); - NoReshapeOpRemovePass no_reshape_op_remove_pass; - names_to_passes_for_shape.emplace_back("NoReshapeOpRemovePass", &no_reshape_op_remove_pass); + CastRemovePass cast_remove_pass; + names_to_passes_for_shape.emplace_back("CastRemovePass", &cast_remove_pass); TransposeTransDataPass transpose_transdata_pass; names_to_passes_for_shape.emplace_back("TransposeTransDataPass", &transpose_transdata_pass); GE_TIMESTAMP_START(ge_passes_for_shape); @@ -1372,6 +1399,7 @@ Status GraphManager::OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_gra } PassManager after_merge_passes; GE_CHK_STATUS_RET(after_merge_passes.AddPass(new (std::nothrow) PermutePass)) + GE_CHK_STATUS_RET(after_merge_passes.AddPass(new (std::nothrow) VariablePrepareOpPass)) GE_IF_BOOL_EXEC(options == "default" || options == "1", GELOGI("turn on variable accelerator"); GE_CHK_STATUS_RET(after_merge_passes.AddPass(new (std::nothrow) VariableOpPass(&var_acc_ctrl_)))) GE_CHK_STATUS_RET(after_merge_passes.AddPass(new (std::nothrow) TransOpDepthFusionPass)) @@ -1411,6 +1439,8 @@ Status GraphManager::OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_gra names_to_passes.emplace_back("ReshapeRemovePass", &trans_op_nearby_allreduce_fusion_pass); ReshapeRemovePass reshape_remove_pass; names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass); + 
ReplaceWithEmptyConstPass replace_with_empty_const_pass; + names_to_passes.emplace_back("ReplaceWithEmptyConstPass", &replace_with_empty_const_pass); ConstantFoldingPass constant_folding_pass; names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass); DimensionAdjustPass dimension_adjust_pass; @@ -1423,7 +1453,11 @@ Status GraphManager::OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_gra return ret; } - ResetConstType(compute_graph); + ret = RemoveIsolatedConst(compute_graph); + if (ret != SUCCESS) { + GELOGE(ret, "Remove isolated Constant failed, ret:%d.", ret); + return ret; + } PassManager pass_for_control_attr_optimize; GE_CHK_STATUS_RET(pass_for_control_attr_optimize.AddPass(new (std::nothrow) MultiBatchPass)) @@ -1493,7 +1527,7 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra "CheckAndReleaseMemory Graph[%u] need memory_size[%ld], weight_size[%ld]," " Device[%u] free_memory_size[%ld]", graph_node->GetGraphId(), memory_size, weight_size, GetContext().DeviceId(), free_memory); - if (CheckInt64AddOverflow(memory_size, weight_size) != SUCCESS) { + if (ge::CheckInt64AddOverflow(memory_size, weight_size) != SUCCESS) { GELOGE(INTERNAL_ERROR, "The sum of Memory size and weight size exceeds INT64_MAX"); return INTERNAL_ERROR; } @@ -1545,14 +1579,15 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra return SUCCESS; } -Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, SubGraphInfoPtr &sub_graph_info_ptr, - uint64_t session_id, const GEThreadLocalContext &ge_context) { +Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, + const SubGraphInfoPtr &sub_graph_info_ptr, uint64_t session_id, + const GEThreadLocalContext &ge_context) { Status ret = SUCCESS; GetThreadLocalContext() = ge_context; if (sub_graph_info_ptr != nullptr && graph_manager != nullptr) { ComputeGraphPtr compute_graph_tmp = sub_graph_info_ptr->GetSubGraph(); const std::string &engine_name = sub_graph_info_ptr->GetEngineName(); - GELOGD("ProcessSubGraphWithMultiThreads start, graph name is %s, engine_name is %s, thread id is %lu", + GELOGI("ProcessSubGraphWithMultiThreads start, graph name is %s, engine_name is %s, thread id is %lu", compute_graph_tmp != nullptr ? compute_graph_tmp->GetName().c_str() : "", engine_name.c_str(), pthread_self()); GraphUtils::DumpGEGraph(compute_graph_tmp, "OptimizeSubGraphBefore"); @@ -1563,11 +1598,13 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager if (ret != SUCCESS) { GELOGE(ret, "SubGraph optimize Failed %s", engine_name.c_str()); return ret; + } else { + GELOGI("SubGraph optimize success %s", engine_name.c_str()); } GraphUtils::DumpGEGraph(compute_graph_tmp, "OptimizeSubGraphAfter"); GraphUtils::DumpGEGraphToOnnx(*compute_graph_tmp, "OptimizeSubGraphAfter"); sub_graph_info_ptr->SetSubGraph(compute_graph_tmp); - GELOGD("ProcessSubGraphWithMultiThreads end, graph name is %s, engine_name is %s, thread id is %lu", + GELOGI("ProcessSubGraphWithMultiThreads end, graph name is %s, engine_name is %s, thread id is %lu", compute_graph_tmp != nullptr ? 
compute_graph_tmp->GetName().c_str() : "", engine_name.c_str(), pthread_self()); } else { @@ -1647,10 +1684,6 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { graph_node->Unlock(); return; } - - if (!compute_graph_tmp->GetNeedIteration()) { - compute_graph_tmp->SetNeedIteration(GraphUtils::CheckIsTrainGraph(compute_graph_tmp)); - } } std::vector ge_models; diff --git a/src/ge/graph/manager/graph_manager.h b/src/ge/graph/manager/graph_manager.h index c76e2e7c..5a296b91 100644 --- a/src/ge/graph/manager/graph_manager.h +++ b/src/ge/graph/manager/graph_manager.h @@ -29,7 +29,7 @@ #include "common/ge_inner_error_codes.h" #include "external/graph/types.h" #include "ge/ge_api_types.h" -#include "graph/build/graph_build.h" +#include "graph/build/graph_builder.h" #include "graph/execute/graph_execute.h" #include "graph/ge_local_context.h" #include "graph/load/graph_loader.h" @@ -175,7 +175,7 @@ class GraphManager { std::shared_ptr GetModelListener() const { return graph_run_listener_; } - static Status ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, SubGraphInfoPtr &sub_graph_info_ptr, + static Status ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, const SubGraphInfoPtr &sub_graph_info_ptr, uint64_t session_id, const GEThreadLocalContext &ge_context); Status PreRun(const GraphNodePtr &graph_node, const std::vector &inputs, vector &ge_models, GeModelPtr &ge_model, uint64_t session_id = INVALID_SESSION_ID); @@ -227,7 +227,9 @@ class GraphManager { bool CheckTransOpForCheckpointGraph(NodePtr &node); - Status MergeSubGraph(ComputeGraphPtr &compute_graph, const std::vector &sub_graph_list); + Status MergeSubGraph(ComputeGraphPtr &compute_graph, const ge::ComputeGraphPtr &original_compute_graph); + + Status SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph); bool IsBroadCastOpData(const ge::NodePtr &var_node); @@ -246,7 +248,7 @@ class GraphManager { // graph context std::shared_ptr GetGraphContext() const { return graph_context_; } - void ResetConstType(ge::ComputeGraphPtr &compute_graph); + Status RemoveIsolatedConst(ge::ComputeGraphPtr &compute_graph); Status OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_graph); diff --git a/src/ge/graph/manager/graph_manager_utils.cc b/src/ge/graph/manager/graph_manager_utils.cc index 3666b5c5..021c0c47 100644 --- a/src/ge/graph/manager/graph_manager_utils.cc +++ b/src/ge/graph/manager/graph_manager_utils.cc @@ -21,8 +21,8 @@ #include "framework/common/debug/ge_log.h" #include "common/ge/ge_util.h" -#include "graph/debug/ge_attr_define.h" #include "common/string_util.h" +#include "graph/debug/ge_attr_define.h" #include "graph/compute_graph.h" #include "graph/op_desc.h" #include "graph/optimize/common/params.h" @@ -96,29 +96,19 @@ Status SubGraphInfo::FreeInOutBuffer() { } } -GraphModelListener::GraphModelListener() : result_code_(0), is_finished_(false), mutex_(nullptr), condition_(nullptr) {} - -Status GraphModelListener::SetCondition(std::mutex *mutex, std::condition_variable *cond) { - if (mutex == nullptr || cond == nullptr) { - GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphManager] param is NULL."); - return GE_GRAPH_PARAM_NULLPTR; - } - - mutex_ = mutex; - condition_ = cond; - return SUCCESS; -} +GraphModelListener::GraphModelListener(std::mutex &mutex, std::condition_variable &cond) + : result_code_(0), is_finished_(false), mutex_(mutex), condition_(cond) {} Status GraphModelListener::OnComputeDone(uint32_t model_id, uint32_t task_id, uint32_t result) { GELOGI( "[GraphManager] graph compute call back, 
model_id:%u, task_id:%u, " "resultCode:%u.", model_id, task_id, result); - GE_IF_BOOL_EXEC(condition_ == nullptr, GELOGE(FAILED, "[GraphModelListener] condition is null."); return FAILED); - std::lock_guard lock(*mutex_); + + std::lock_guard lock(mutex_); result_code_ = result; is_finished_ = true; - condition_->notify_all(); + condition_.notify_all(); return SUCCESS; } @@ -132,12 +122,7 @@ uint32_t GraphModelListener::GetResultCode() const { } Status GraphModelListener::ResetResult() { - if (mutex_ == nullptr) { - GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphManager] param is NULL."); - return GE_GRAPH_PARAM_NULLPTR; - } - - std::lock_guard lock(*mutex_); + std::lock_guard lock(mutex_); result_code_ = 0; is_finished_ = false; diff --git a/src/ge/graph/manager/graph_manager_utils.h b/src/ge/graph/manager/graph_manager_utils.h index 7c6e9be1..ca33aba1 100644 --- a/src/ge/graph/manager/graph_manager_utils.h +++ b/src/ge/graph/manager/graph_manager_utils.h @@ -113,6 +113,7 @@ class SubGraphInfo { }; using SubGraphInfoPtr = std::shared_ptr; +using Graph2SubGraphInfoList = std::unordered_map>; // for run graph async listener class RunAsyncListener : public ge::ModelListener { @@ -184,15 +185,13 @@ using ConstGraphNodePtr = shared_ptr; class GraphModelListener : public ge::ModelListener { public: - GraphModelListener(); + GraphModelListener(std::mutex &mutex, std::condition_variable &cond); ~GraphModelListener() = default; // callback Status OnComputeDone(uint32_t model_id, uint32_t task_id, uint32_t result) override; - Status SetCondition(std::mutex *mutex, std::condition_variable *cond); - Status ResetResult(); // need lock by caller @@ -205,9 +204,9 @@ class GraphModelListener : public ge::ModelListener { bool is_finished_; // not owner - std::mutex *mutex_; + std::mutex &mutex_; // not owner - std::condition_variable *condition_; + std::condition_variable &condition_; }; Status ParseOutNodes(const string &out_nodes); @@ -236,7 +235,7 @@ struct GraphManagerOptions { std::map stream_max_parallel_num; std::string output_datatype; std::string original_model_file; - bool save_original_model; + std::string save_original_model; GraphManagerOptions() : stream_num(1), perf_level(domi::GEN_TASK_WITHOUT_FUSION), @@ -257,7 +256,7 @@ struct GraphManagerOptions { local_fmk_op_flag(false), hcom_parallel(false), enable_print_op_pass(true), - save_original_model(false) {} + save_original_model("false") {} }; } // namespace ge diff --git a/src/ge/graph/manager/graph_mem_allocator.cc b/src/ge/graph/manager/graph_mem_allocator.cc index 25fc5eb2..f01a0b4b 100644 --- a/src/ge/graph/manager/graph_mem_allocator.cc +++ b/src/ge/graph/manager/graph_mem_allocator.cc @@ -60,6 +60,7 @@ uint8_t *MemoryAllocator::MallocMemory(uint64_t memory_size, uint32_t device_id) } GELOGI("MemoryAllocator::MallocMemory device_id = %u, size= %lu", device_id, memory_size); + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "malloc function.", memory_size) return memory_addr; } @@ -102,9 +103,9 @@ Status MemoryAllocator::FreeMemory(const string &memory_key, uint32_t device_id) if (it == memory_base_map_.end()) { if (mem_malloced_) { GELOGW( - "MemoryAllocator::FreeMemory failed," - " memory_key[%s] was not exist, device_id = %u.", - memory_key.c_str(), device_id); + "MemoryAllocator::FreeMemory failed," + " memory_key[%s] was not exist, device_id = %u.", + memory_key.c_str(), device_id); } return ge::INTERNAL_ERROR; } @@ -136,9 +137,9 @@ uint8_t *MemoryAllocator::GetMemoryAddr(const string &memory_key, uint32_t devic auto it = 
memory_base_map_.find(memory_key); if (it == memory_base_map_.end()) { GELOGW( - "MemoryAllocator::GetMemoryAddr failed," - " memory_key[%s] was not exist, device_id = %u.", - memory_key.c_str(), device_id); + "MemoryAllocator::GetMemoryAddr failed," + " memory_key[%s] was not exist, device_id = %u.", + memory_key.c_str(), device_id); return nullptr; } diff --git a/src/ge/graph/manager/graph_var_manager.cc b/src/ge/graph/manager/graph_var_manager.cc index c0117bdf..d5ffbd03 100644 --- a/src/ge/graph/manager/graph_var_manager.cc +++ b/src/ge/graph/manager/graph_var_manager.cc @@ -19,11 +19,11 @@ #include #include "common/l2_cache_optimize.h" -#include "graph/debug/ge_attr_define.h" #include "common/types.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" #include "ge/ge_api_types.h" +#include "graph/debug/ge_attr_define.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/trans_var_data_utils.h" #include "graph/utils/attr_utils.h" @@ -67,6 +67,7 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens void VarResource::SetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *dev_ptr, rtMemType_t memory_type) { std::string var_key = VarKey(var_name, tensor_desc); + GELOGI("VarResource::SetVarAddr , var_key = %s, mem_type:%u", var_key.c_str(), memory_type); if (var_addr_mgr_map_.count(var_key) == 0) { GELOGI("SetVarAddr node_name %s, tensor_desc type %s, format %s", var_name.c_str(), TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), @@ -88,6 +89,9 @@ ge::Status VarResource::SaveVarAddr(const std::string &var_name, const ge::GeTen if (var_addr_mgr_map_.count(var_key) == 0) { uint64_t logic_address = VarManager::Instance(0)->GetVarMemLogicBase() + reinterpret_cast(reinterpret_cast(address)); + GELOGI("SaveVarAddr node_name %s, tensor_desc format %s, type %s.", var_name.c_str(), + TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str(), + TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str()); VarAddrMgr var_addr_mgr; var_addr_mgr.address = reinterpret_cast(reinterpret_cast(logic_address)); var_addr_mgr.offset = reinterpret_cast(reinterpret_cast(address)); @@ -257,7 +261,7 @@ MemResource::MemResource() : total_size_(0), var_mem_size_(0) {} Status MemResource::AssignVarMem(const std::string &var_name, uint64_t size, uint64_t session_id, size_t &mem_offset) { size = (size + kSessionMemAlignSize - 1) / kSessionMemAlignSize * kSessionMemAlignSize; - + uint64_t real_size = size; total_size_ = VarManager::Instance(0)->GetVarMemMaxSize(); if (total_size_ < var_mem_size_) { GELOGE(PARAM_INVALID, "total_size_: %lu is smaller than var_mem_size_: %lu", total_size_, var_mem_size_); @@ -265,7 +269,8 @@ Status MemResource::AssignVarMem(const std::string &var_name, uint64_t size, uin } uint64_t free_size = total_size_ - var_mem_size_; if (free_size < (size + kSessionMemAlignSize * 2)) { - GELOGE(PARAM_INVALID, "malloc var mem, size[%lu] > free_size[%lu]", size, free_size); + GELOGE(PARAM_INVALID, "Out of memory : current var size[%lu] exceeds total var size[%lu]", + size + kSessionMemAlignSize * 2 + var_mem_size_, total_size_); return PARAM_INVALID; } @@ -403,8 +408,13 @@ int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) { ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, rtMemType_t memory_type) { std::lock_guard lock(mutex_); + GELOGI( + "VarManager::AssignVarMem var_name = %s, data_type = %s, 
data_format = " + "%s.", + var_name.c_str(), ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str()); - uint32_t tensor_desc_size = 0; + int64_t tensor_desc_size = 0; size_t mem_offset = 0; ge::Status result = TensorUtils::GetSize(tensor_desc, tensor_desc_size); if (result != ge::SUCCESS) { @@ -465,13 +475,14 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen if (cur_tensor_desc.GetFormat() != tensor_desc.GetFormat() || cur_tensor_desc.GetDataType() != tensor_desc.GetDataType() || cur_tensor_desc.GetShape().GetDims() != tensor_desc.GetShape().GetDims()) { - GELOGI("var %s assigned new memory (format, data type, shape) (%s, %s, %zu) from (%s, %s, %zu)", var_name.c_str(), - ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), - ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str(), - tensor_desc.GetShape().GetDims().size(), - ge::TypeUtils::DataTypeToSerialString(cur_tensor_desc.GetDataType()).c_str(), - ge::TypeUtils::FormatToSerialString(cur_tensor_desc.GetFormat()).c_str(), - cur_tensor_desc.GetShape().GetDims().size()); + GELOGI( + "var %s assigned new memory (format, data type, shape) (%s, %s, " + "%zu) from (%s, %s, %zu)", + var_name.c_str(), ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str(), tensor_desc.GetShape().GetDims().size(), + ge::TypeUtils::DataTypeToSerialString(cur_tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(cur_tensor_desc.GetFormat()).c_str(), + cur_tensor_desc.GetShape().GetDims().size()); var_resource_->SetVarAddr(var_name, tensor_desc, reinterpret_cast(reinterpret_cast(mem_offset)), memory_type); } diff --git a/src/ge/graph/manager/trans_var_data_utils.cc b/src/ge/graph/manager/trans_var_data_utils.cc index 9873ffb2..6109b120 100644 --- a/src/ge/graph/manager/trans_var_data_utils.cc +++ b/src/ge/graph/manager/trans_var_data_utils.cc @@ -28,10 +28,10 @@ namespace ge { Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, - uint8_t *dst_addr, uint32_t dst_addr_size, uint64_t session_id) { + uint8_t *dst_addr, int64_t dst_addr_size, uint64_t session_id) { GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "dst addr is null. "); uint8_t *src_host_addr = nullptr; - uint32_t src_addr_size = 0; + int64_t src_addr_size = 0; GE_MAKE_GUARD_RTMEM(src_host_addr); GE_CHK_STATUS_RET(SyncTensorToHost(var_name, src_tensor_desc, &src_host_addr, src_addr_size, session_id)); @@ -42,7 +42,7 @@ Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge return SUCCESS; } -Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, uint32_t src_addr_size, const string &var_name, +Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_addr_size, const string &var_name, const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id) { GE_CHK_BOOL_RET_STATUS(src_addr != nullptr, FAILED, "src addr is null. 
"); uint8_t *host_addr = nullptr; @@ -51,26 +51,26 @@ Status TransVarDataUtils::SyncBroadCastData2Var(uint8_t *src_addr, uint32_t src_ GE_CHK_RT_RET(rtMemcpy(host_addr, src_addr_size, src_addr, src_addr_size, RT_MEMCPY_DEVICE_TO_HOST)); GE_CHK_STATUS_RET( - SyncTensorToDevice(var_name, reinterpret_cast(host_addr), src_addr_size, dst_tensor_desc, session_id)); + SyncTensorToDevice(var_name, reinterpret_cast(host_addr), src_addr_size, dst_tensor_desc, session_id)); return SUCCESS; } Status TransVarDataUtils::SyncTensorToHost(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, - uint8_t **host_addr, uint32_t &src_tensor_size, uint64_t session_id) { + uint8_t **host_addr, int64_t &src_tensor_size, uint64_t session_id) { GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(src_tensor_desc, src_tensor_size), "get size from TensorDesc failed"); uint8_t *src_addr = nullptr; GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, src_tensor_desc, &src_addr)); - uint8_t *mem_addr = - src_addr - static_cast(reinterpret_cast(VarManager::Instance(0)->GetVarMemLogicBase())) + - static_cast( - reinterpret_cast(VarManager::Instance(session_id)->GetVarMemoryBase(RT_MEMORY_HBM))); + uint8_t *mem_addr = src_addr - + static_cast(reinterpret_cast(VarManager::Instance(0)->GetVarMemLogicBase())) + + static_cast( + reinterpret_cast(VarManager::Instance(session_id)->GetVarMemoryBase(RT_MEMORY_HBM))); GE_CHK_RT_RET(rtMallocHost(reinterpret_cast(host_addr), src_tensor_size)); GE_CHK_RT_RET(rtMemcpy(*host_addr, src_tensor_size, mem_addr, src_tensor_size, RT_MEMCPY_DEVICE_TO_HOST)); - GELOGI("SyncTensorToHost var_name %s, src_tensor_size %u", var_name.c_str(), src_tensor_size); + GELOGI("SyncTensorToHost var_name %s, src_tensor_size %ld", var_name.c_str(), src_tensor_size); return SUCCESS; } @@ -78,10 +78,10 @@ Status TransVarDataUtils::SyncTensorToDevice(const string &var_name, const uint8 const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id) { uint8_t *dst_addr = nullptr; GE_CHK_STATUS_RET(VarManager::Instance(session_id)->GetVarAddr(var_name, dst_tensor_desc, &dst_addr)); - uint8_t *mem_addr = - dst_addr - static_cast(reinterpret_cast(VarManager::Instance(0)->GetVarMemLogicBase())) + - static_cast( - reinterpret_cast(VarManager::Instance(session_id)->GetVarMemoryBase(RT_MEMORY_HBM))); + uint8_t *mem_addr = dst_addr - + static_cast(reinterpret_cast(VarManager::Instance(0)->GetVarMemLogicBase())) + + static_cast( + reinterpret_cast(VarManager::Instance(session_id)->GetVarMemoryBase(RT_MEMORY_HBM))); GE_CHK_RT_RET(rtMemcpy(mem_addr, addr_size, host_addr, addr_size, RT_MEMCPY_HOST_TO_DEVICE)); GELOGI("SyncTensorToDevice var_name %s, addr_size %u", var_name.c_str(), addr_size); diff --git a/src/ge/graph/manager/trans_var_data_utils.h b/src/ge/graph/manager/trans_var_data_utils.h index 213dad12..69521dab 100644 --- a/src/ge/graph/manager/trans_var_data_utils.h +++ b/src/ge/graph/manager/trans_var_data_utils.h @@ -27,14 +27,13 @@ namespace ge { class TransVarDataUtils { public: static ge::Status SyncVarData2BroadCast(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, - uint8_t *dst_addr, uint32_t dst_addr_size, uint64_t session_id_); - static ge::Status SyncBroadCastData2Var(uint8_t *src_addr, uint32_t src_addr_size, const string &var_name, + uint8_t *dst_addr, int64_t dst_addr_size, uint64_t session_id_); + static ge::Status SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_addr_size, const string &var_name, const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id_); 
private: static ge::Status SyncTensorToHost(const string &var_name, const ge::GeTensorDesc &src_tensor_desc, - uint8_t **host_addr, uint32_t &addr_size, uint64_t session_id_); - + uint8_t **host_addr, int64_t &addr_size, uint64_t session_id_); static ge::Status SyncTensorToDevice(const string &var_name, const uint8_t *host_addr, uint32_t addr_size, const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id_); }; diff --git a/src/ge/graph/manager/util/debug.cc b/src/ge/graph/manager/util/debug.cc index 96420638..b2ef1c92 100644 --- a/src/ge/graph/manager/util/debug.cc +++ b/src/ge/graph/manager/util/debug.cc @@ -54,7 +54,7 @@ void Debug::DumpProto(const Message &proto, const char *file) { } } -Status Debug::DumpDevMem(const char *file, const void *addr, uint32_t size) { +Status Debug::DumpDevMem(const char *file, const void *addr, int64_t size) { if (size == 0) { GELOGI("Dump data failed because the size is 0."); return SUCCESS; diff --git a/src/ge/graph/manager/util/debug.h b/src/ge/graph/manager/util/debug.h index 8742db4f..9cedb680 100644 --- a/src/ge/graph/manager/util/debug.h +++ b/src/ge/graph/manager/util/debug.h @@ -55,7 +55,7 @@ class Debug { ~Debug(); static void DumpProto(const Message &proto, const char *file); - static Status DumpDevMem(const char *file, const void *addr, uint32_t size); + static Status DumpDevMem(const char *file, const void *addr, int64_t size); }; } // namespace ge diff --git a/src/ge/graph/manager/util/hcom_util.cc b/src/ge/graph/manager/util/hcom_util.cc index a9eb323d..a1c4d769 100644 --- a/src/ge/graph/manager/util/hcom_util.cc +++ b/src/ge/graph/manager/util/hcom_util.cc @@ -74,33 +74,33 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType return PARAM_INVALID; } ge::GeShape shape = ge::GeShape(shape_dims); - auto input_size = static_cast(shape.GetShapeSize() * size); + int64_t input_size = shape.GetShapeSize() * size; total_size = (input_size + align_size - 1) / align_size * align_size; } else { for (size_t i = 0; i < op_desc->GetInputsSize(); i++) { - uint32_t input_size = 0; + int64_t input_size = 0; int64_t block_size = 0; GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i)); GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size), "get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i); GE_IF_BOOL_EXEC( - op_desc->GetType() == HCOMREDUCESCATTER, int32_t rank_size = 0; - GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_RANK_SIZE, rank_size), PARAM_INVALID, - "get HCOM_ATTR_RANK_SIZE failed"); - GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "rank size is zero"); - int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); GE_CHK_STATUS_RET( - CheckInt64Int32MulOverflow(shape_size, size), "Product of shape size and size beyond INT64_MAX"); - block_size = (shape_size * size) / rank_size; - GE_CHK_STATUS_RET(CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); - total_size = total_size + block_size; continue;); + op_desc->GetType() == HCOMREDUCESCATTER, int32_t rank_size = 0; + GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_RANK_SIZE, rank_size), PARAM_INVALID, + "get HCOM_ATTR_RANK_SIZE failed"); + GE_CHK_BOOL_RET_STATUS(rank_size != 0, PARAM_INVALID, "rank size is zero"); + int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); GE_CHK_STATUS_RET( + ge::CheckInt64Uint32MulOverflow(shape_size, size), "Product of shape size and size beyond INT64_MAX"); 
+ block_size = (shape_size * size) / rank_size; + GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); + total_size = total_size + block_size; continue;); int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize(); - GE_CHK_STATUS_RET(CheckInt64Int32MulOverflow(shape_size, size), + GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size), "Product of shape size and size beyond INT64_MAX"); GE_IF_BOOL_EXEC(is_allgather, block_size = shape_size * size;); GE_IF_BOOL_EXEC(!is_allgather, block_size = (input_size + align_size - 1) / align_size * align_size;); - GE_CHK_STATUS_RET(CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); + GE_CHK_STATUS_RET(ge::CheckInt64AddOverflow(total_size, block_size), "Total size is beyond the INT64_MAX"); total_size = total_size + block_size; } } diff --git a/src/ge/graph/optimize/common/params.h b/src/ge/graph/optimize/common/params.h index 71151c1d..ee2a735b 100644 --- a/src/ge/graph/optimize/common/params.h +++ b/src/ge/graph/optimize/common/params.h @@ -40,9 +40,6 @@ class Params : public Singleton { if (tmp_target == "mini") { target_ = "MINI"; target_8bit_ = TARGET_TYPE_MINI_8BIT; - } else if (tmp_target == "tiny") { - target_ = "TINY"; - target_8bit_ = TARGET_TYPE_TINY_8BIT; } else if (tmp_target == "lite") { target_ = "LITE"; target_8bit_ = TARGET_TYPE_LTTE_8BIT; @@ -63,4 +60,3 @@ class Params : public Singleton { } // namespace ge #endif // GE_GRAPH_OPTIMIZE_COMMON_PARAMS_H_ - diff --git a/src/ge/graph/optimize/graph_optimize.cc b/src/ge/graph/optimize/graph_optimize.cc index 849ad296..0be0aeee 100644 --- a/src/ge/graph/optimize/graph_optimize.cc +++ b/src/ge/graph/optimize/graph_optimize.cc @@ -30,6 +30,7 @@ using ge::ComputeGraph; using ge::OpDesc; namespace { +const char *const kVectorCore = "VectorCore"; const char *const kVectorEngine = "VectorEngine"; const char *const kAicoreEngine = "AIcoreEngine"; } // namespace @@ -40,7 +41,7 @@ GraphOptimize::GraphOptimize() cal_config_(""), insert_op_config_(""), parse_out_node_(""), - core_type_(kAicoreEngine), + core_type_(""), graph_context_(nullptr) {} void AddNodeInputProperty(ComputeGraphPtr &compute_graph) { @@ -140,7 +141,7 @@ Status GraphOptimize::OptimizeOriginalGraph(ComputeGraphPtr &compute_graph) { std::map graph_optimizer = instance_ptr->OpsKernelManagerObj().GetAllGraphOptimizerObjs(); GELOGI("optimize by opskernel in original graph optimize phase. num of graph_optimizer is %lu.", graph_optimizer.size()); - string exclude_core_Type = (core_type_ == kVectorEngine) ? kAicoreEngine : kVectorEngine; + string exclude_core_Type = (core_type_ == kVectorCore) ? kAicoreEngine : kVectorEngine; GELOGD("[OptimizeOriginalGraph]: engine type will exclude: %s", exclude_core_Type.c_str()); if (graph_optimizer.size() != 0) { for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) { @@ -173,7 +174,7 @@ Status GraphOptimize::OptimizeOriginalGraphForQuantize(ComputeGraphPtr &compute_ GELOGI("optimize by opskernel in original graph optimize quantize phase. num of graph_optimizer is %zu.", graph_optimizer.size()); Status ret = SUCCESS; - string exclude_core_Type = (core_type_ == kVectorEngine) ? kAicoreEngine : kVectorEngine; + string exclude_core_Type = (core_type_ == kVectorCore) ? 
kAicoreEngine : kVectorEngine;
   GELOGD("[OptimizeOriginalGraphForQuantize]: engine type will exclude: %s", exclude_core_Type.c_str());
   if (graph_optimizer.size() != 0) {
     for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) {
diff --git a/src/ge/graph/optimize/optimizer/graph_pass.h b/src/ge/graph/optimize/optimizer/graph_pass.h
deleted file mode 100644
index 7393fd43..00000000
--- a/src/ge/graph/optimize/optimizer/graph_pass.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef GE_GRAPH_OPTIMIZE_OPTIMIZER_GRAPH_PASS_H_
-#define GE_GRAPH_OPTIMIZE_OPTIMIZER_GRAPH_PASS_H_
-
-#include <string>
-#include <vector>
-
-#include "./pass.h"
-#include "common/op/attr_value_util.h"
-#include "common/op/ge_op_utils.h"
-#include "framework/common/debug/ge_log.h"
-#include "framework/common/ge_inner_error_codes.h"
-#include "graph/compute_graph.h"
-#include "graph/utils/attr_utils.h"
-#include "graph/utils/graph_utils.h"
-
-namespace ge {
-///
-/// @ingroup domi
-/// @brief
-/// @author
-///
-class GraphPass : public Pass {
- public:
-  ///
-  /// @param [in] graph graph to optimize
-  /// @return SUCCESS optimize success
-  /// @return NOT_CHANGED not optimized
-  /// @return other optimize fail
-  /// @author
-  ///
-  virtual Status Run(ge::ComputeGraphPtr graph) = 0;
-  static void RecordOriginalNames(std::vector<ge::NodePtr> originalNodes, const ge::NodePtr &node) {
-    GE_CHECK_NOTNULL_JUST_RETURN(node);
-    std::vector<std::string> originalNames;
-    for (ge::NodePtr nodeTmp : originalNodes) {
-      GE_IF_BOOL_EXEC(nodeTmp == nullptr, return;)
-      std::vector<std::string> namesTmp;
-      ge::OpDescPtr opdescTmp = nodeTmp->GetOpDesc();
-      if (!ge::AttrUtils::GetListStr(opdescTmp, "original_op_names", namesTmp)) {
-        GELOGW("Get original_op_names failed");
-      }
-      if (namesTmp.size() != 0) {
-        originalNames.insert(originalNames.end(), namesTmp.begin(), namesTmp.end());
-      } else {
-        originalNames.emplace_back(opdescTmp->GetName());
-      }
-    }
-
-    if (originalNames.size() == 0) {
-      std::string tmp;
-      originalNames.emplace_back(tmp);
-    }
-
-    GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(node->GetOpDesc(), "_datadump_original_op_names", originalNames),
-                     return, "Set original_op_names fail.")
-  }
-
-  static bool IsConstNode(const ge::NodePtr &node) {
-    if (node == nullptr) {
-      GELOGE(PARAM_INVALID, "Input param node is nullptr.");
-      return false;
-    }
-    if (node->GetOpDesc()->GetType() == CONSTANTOP) {
-      return true;
-    } else if (node->GetOpDesc()->GetType() == FRAMEWORKOP) {
-      string type;
-      GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, type),
-                       return false, "Get original_type for op %s fail!", node->GetName().c_str());
-      GE_IF_BOOL_EXEC(type == CONSTANT, GELOGI("Is const op"); return true);
-      return false;
-    } else {
-      return false;
-    }
-  }
-};
-}  // namespace ge
-#endif  // GE_GRAPH_OPTIMIZE_OPTIMIZER_GRAPH_PASS_H_
diff --git a/src/ge/graph/partition/engine_place.cc b/src/ge/graph/partition/engine_place.cc index
eb8a0f11..74da0326 100644 --- a/src/ge/graph/partition/engine_place.cc +++ b/src/ge/graph/partition/engine_place.cc @@ -71,7 +71,7 @@ Status EnginePlacer::AssignEngineAndLog(ge::ConstNodePtr node_ptr, const std::st } // private function, promise node_ptr->GetOpDesc() not null - GELOGD("Assigning DNNEngine %s to node %s, op type is %s", engine_name.c_str(), node_ptr->GetName().c_str(), + GELOGD("Assigning DNNEngine %s to node %s, op type %s", engine_name.c_str(), node_ptr->GetName().c_str(), node_ptr->GetOpDesc()->GetType().c_str()); // Record the node assigned engine name diff --git a/src/ge/graph/partition/engine_place.h b/src/ge/graph/partition/engine_place.h index 93a101ff..8a3e83a5 100644 --- a/src/ge/graph/partition/engine_place.h +++ b/src/ge/graph/partition/engine_place.h @@ -24,7 +24,7 @@ #include "graph/compute_graph.h" namespace ge { -using NodeEngineMap = std::unordered_map; +using NodeEngineMap = std::unordered_map; /// /// @ingroup graph/partition @@ -37,12 +37,6 @@ class EnginePlacer { EnginePlacer() = default; ~EnginePlacer() = default; - // Disable copy constructor and assignment operator - EnginePlacer(const EnginePlacer &) = delete; - EnginePlacer(const EnginePlacer &&) = delete; - EnginePlacer &operator=(const EnginePlacer &) = delete; - EnginePlacer &operator=(const EnginePlacer &&) = delete; - Status Run(); // Get the unique node-engine map diff --git a/src/ge/graph/partition/graph_partition.cc b/src/ge/graph/partition/graph_partition.cc index 2324c6e0..feced331 100644 --- a/src/ge/graph/partition/graph_partition.cc +++ b/src/ge/graph/partition/graph_partition.cc @@ -22,10 +22,9 @@ #include #include "common/ge/ge_util.h" #include "common/op/ge_op_utils.h" -#include "graph/debug/ge_attr_define.h" #include "framework/common/types.h" +#include "graph/debug/ge_attr_define.h" #include "graph/manager/graph_manager_utils.h" -#include "graph/optimize/optimizer/graph_pass.h" #include "graph/utils/graph_utils.h" #include "graph/utils/op_desc_utils.h" #include "graph/utils/type_utils.h" @@ -43,20 +42,14 @@ const int kRankZero = 0; // order of graph list is 0,1,2,3..., 0 means first or namespace ge { Status ge::GraphPartitioner::CheckIfEnd2PldEmpty(ge::ComputeGraphPtr &output_merged_compute_graph) { // only one condition:no data node, one engine, there is only one graph + input graph - if (partitions_.size() == kOneGraph) { - auto partition = (*partitions_.begin()); + if (graph_info_.partitions_.size() == kOneGraph) { + auto partition = (*graph_info_.partitions_.begin()); if (partition.first == nullptr) { GELOGE(GE_GRAPH_EMPTY_PARTITION, "[GraphPartitioner]: partition.first is null, engine name is %s", partition.second.c_str()); return FAILED; } output_merged_compute_graph = partition.first; - // flush all nodes' engine of merged graph - engine_placer_.SetComputeGraph(output_merged_compute_graph); - if (engine_placer_.Run() != SUCCESS) { - GELOGE(GE_GRAPH_INIT_FAILED, "[GraphPartitioner]: engine_placer run failed"); - return FAILED; - } } else { // if placeholder to end map is empty, it should be an exception condition GELOGE(GE_GRAPH_EMPTY_PARTITION, "[GraphPartitioner]: placeholder to end map is empty, partitions size is not 1."); return FAILED; @@ -66,13 +59,13 @@ Status ge::GraphPartitioner::CheckIfEnd2PldEmpty(ge::ComputeGraphPtr &output_mer Status ge::GraphPartitioner::MergeAllSubGraph(ge::ComputeGraphPtr &output_merged_compute_graph, const std::vector &sub_graph_list) { - for (size_t rank = 0; rank < rank_2_partitions_.size(); rank++) { + for (size_t rank = 0; rank < 
graph_info_.rank_2_partitions_.size(); rank++) { string temp_stream; // sub_graph_list index is one ahead of rank_2_partitions_list index if (rank > 0) { temp_stream = sub_graph_list[rank - 1]->GetStreamLabel(); } - for (const auto &node : rank_2_partitions_[rank]->GetAllNodes()) { + for (const auto &node : graph_info_.rank_2_partitions_[rank]->GetDirectNode()) { if (node == nullptr) { continue; } @@ -97,8 +90,8 @@ Status ge::GraphPartitioner::MergeAllSubGraph(ge::ComputeGraphPtr &output_merged void ge::GraphPartitioner::SetMergedGraphId(ge::ComputeGraphPtr &output_merged_compute_graph) { string session_graph_id; // get session graph id from subgraph - if (rank_2_partitions_.empty() || - !AttrUtils::GetStr(*(rank_2_partitions_[0]), ATTR_NAME_SESSION_GRAPH_ID, session_graph_id)) { + if (graph_info_.rank_2_partitions_.empty() || + !AttrUtils::GetStr(*(graph_info_.rank_2_partitions_[0]), ATTR_NAME_SESSION_GRAPH_ID, session_graph_id)) { GELOGW("Get graph session_graph_id attr failed."); } // set session graph id into merged subgraph @@ -118,9 +111,9 @@ Status ge::GraphPartitioner::RemoveNodeAndEdgeBetweenEndPld(ge::ComputeGraphPtr GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MergeAllSubGraph failed."); return FAILED; } - for (const auto &it : index_2_end_) { + for (const auto &it : graph_info_.index_2_end_) { auto &end = it.second; - auto &pld = end_2_pld_[it.second]; + auto &pld = graph_info_.end_2_pld_[it.second]; if ((end != nullptr) && (pld != nullptr) && (end->GetInDataAnchor(0) != nullptr) && (pld->GetOutDataAnchor(0) != nullptr)) { AnchorPtr end_in_anchor = (end->GetInDataAnchor(0)->GetFirstPeerAnchor() == nullptr) @@ -154,28 +147,67 @@ Status ge::GraphPartitioner::RemoveNodeAndEdgeBetweenEndPld(ge::ComputeGraphPtr return SUCCESS; } -void ge::GraphPartitioner::ClearAllPartitionData(Mode mode) { - transfer_graph_.clear(); - rank_2_partitions_.clear(); - partitions_2_rank_.clear(); - partitions_.clear(); - corresponding_node_in_partitions_.clear(); - index_2_end_.clear(); - cluster_2_partition_.clear(); - clusters_.clear(); - node_2_cluster_.clear(); - pld_2_end_.clear(); - end_2_pld_.clear(); - if (mode_ == kMerging) { - mode_ = kPartitioning; - } else { - mode_ = mode; +Status ge::GraphPartitioner::MergeAfterSubGraphOptimization(ge::ComputeGraphPtr &output_merged_compute_graph, + const ge::ComputeGraphPtr &original_compute_graph) { + auto ret = MergeSubGraph(output_merged_compute_graph, original_compute_graph); + if (ret != SUCCESS) { + GELOGE(ret, "Graph merging Failed"); + return ret; } + // partition sub graph + for (const auto &sub_graph : original_compute_graph->GetAllSubgraphs()) { + ComputeGraphPtr merged_sub_graph = nullptr; + ret = MergeSubGraph(merged_sub_graph, sub_graph); + if (ret != SUCCESS) { + GELOGE(ret, "Sub graph merging Failed"); + return ret; + } + // add sub graph + output_merged_compute_graph->SetName(original_compute_graph->GetName()); + merged_sub_graph->SetName(sub_graph->GetName()); + merged_sub_graph->SetInputSize(sub_graph->GetInputSize()); + merged_sub_graph->SetOutputSize(sub_graph->GetOutputSize()); + auto parent_node = sub_graph->GetParentNode(); + GE_IF_BOOL_EXEC(parent_node == nullptr, + GELOGE(FAILED, "Parent node is null, graph name is %s", sub_graph->GetName().c_str()); + return FAILED;) + auto original_graph = parent_node->GetOwnerComputeGraph(); + GE_IF_BOOL_EXEC(graph_2_graph_partition_info_.find(original_graph) == graph_2_graph_partition_info_.end(), + GELOGE(FAILED, "Find graph info failed, graph name is %s", 
original_graph->GetName().c_str()); + return FAILED;) + auto graph_info = graph_2_graph_partition_info_[original_graph]; + GE_IF_BOOL_EXEC( + graph_info.corresponding_node_in_partitions_.find(parent_node) == + graph_info.corresponding_node_in_partitions_.end(), + GELOGE(FAILED, "Find corresponding node failed, parent node name is %s", parent_node->GetName().c_str()); + return FAILED;) + auto corresponding_node = graph_info.corresponding_node_in_partitions_[parent_node]; + merged_sub_graph->SetParentNode(corresponding_node); + merged_sub_graph->SetParentGraph(output_merged_compute_graph); + ret = output_merged_compute_graph->AddSubgraph(sub_graph->GetName(), merged_sub_graph); + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, return ret;) + } + graph_2_graph_partition_info_.clear(); + graph_2_subgraph_list_.clear(); + return SUCCESS; } -Status ge::GraphPartitioner::MergeAfterSubGraphOptimization(ge::ComputeGraphPtr &output_merged_compute_graph, - const std::vector &sub_graph_list) { - if (mode_ != kMerging) { +Status ge::GraphPartitioner::MergeSubGraph(ge::ComputeGraphPtr &output_merged_compute_graph, + const ge::ComputeGraphPtr &original_compute_graph) { + if (original_compute_graph == nullptr) { + GELOGE(GE_GRAPH_NULL_INPUT, "[GraphPartitioner]: compute_graph is null."); + return FAILED; + } + if ((graph_2_graph_partition_info_.find(original_compute_graph) == graph_2_graph_partition_info_.end()) || + (graph_2_subgraph_list_.find(original_compute_graph) == graph_2_subgraph_list_.end())) { + GELOGE(GE_GRAPH_NULL_INPUT, "[GraphPartitioner]: compute_graph is error."); + return FAILED; + } + GraphPartitionInfo &subgraph_info = graph_2_graph_partition_info_[original_compute_graph]; + const auto &sub_graph_list = graph_2_subgraph_list_[original_compute_graph]; + graph_info_ = subgraph_info; + + if (graph_info_.mode_ != kMerging) { GELOGE(GE_GRAPH_UNSUPPORTED, "Cannot call merging in partition mode"); return FAILED; } @@ -187,7 +219,8 @@ Status ge::GraphPartitioner::MergeAfterSubGraphOptimization(ge::ComputeGraphPtr return FAILED; } } - if (end_2_pld_.empty() || pld_2_end_.empty()) { + bool is_map_empty = graph_info_.end_2_pld_.empty() || graph_info_.pld_2_end_.empty(); + if (is_map_empty) { if (CheckIfEnd2PldEmpty(output_merged_compute_graph) != SUCCESS) { return FAILED; } @@ -205,15 +238,6 @@ Status ge::GraphPartitioner::MergeAfterSubGraphOptimization(ge::ComputeGraphPtr return FAILED; } GE_TIMESTAMP_END(MergeGraphTopologicalSorting, "GraphPartitioner::MergeGraphTopologicalSorting"); - // flush all nodes' engine of merged graph - GE_TIMESTAMP_START(MergeGraphEnginePlacerRun); - engine_placer_.SetComputeGraph(output_merged_compute_graph); - if (engine_placer_.Run() != SUCCESS) { - GELOGE(GE_GRAPH_INIT_FAILED, "[GraphPartitioner]: engine_placer run failed"); - return FAILED; - } - GE_TIMESTAMP_END(MergeGraphEnginePlacerRun, "GraphPartitioner::MergeGraphEnginePlacerRun"); - ClearAllPartitionData(kMerging); GELOGI("Graph merge ends."); return SUCCESS; } @@ -297,18 +321,19 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr string engine_end_name; string engine_pld_name; // link input -> end - string end_name = kEndType + std::to_string(num_of_pld_end_); + string end_name = kEndType + std::to_string(graph_info_.num_of_pld_end_); auto end_op_desc = MakeShared(end_name, END); if (end_op_desc == nullptr) { GELOGE(GRAPH_PARAM_INVALID, "pld_op_desc is nullptr."); return FAILED; } - GE_IF_BOOL_EXEC(!AttrUtils::SetInt(end_op_desc, "peerIndex", num_of_pld_end_), GELOGW("SetInt 
peerIndex failed");) + GE_IF_BOOL_EXEC(!AttrUtils::SetInt(end_op_desc, "peerIndex", graph_info_.num_of_pld_end_), + GELOGW("SetInt peerIndex failed");) GE_IF_BOOL_EXEC(!AttrUtils::SetStr(end_op_desc, "parentOpType", dst_node->GetType()), GELOGW("SetStr parentOpType failed");) // replace input_desc of end with owner node's desc int input_index = ge::AnchorUtils::GetIdx(peer_in_anchor); - bool is_need_update_desc = (input_index >= 0) && (mode_ == kPartitioning); + bool is_need_update_desc = (input_index >= 0) && (graph_info_.mode_ == kPartitioning); if (is_need_update_desc) { if (UpdateEndOpDesc(dst_node, input_index, end_op_desc) != SUCCESS) { GELOGE(GRAPH_PARAM_INVALID, "UpdateEndOpDesc failed, input index %d, engine name is %s", input_index, @@ -342,13 +367,14 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr auto const &src_node_opdesc = src_node->GetOpDesc(); GE_CHECK_NOTNULL(src_node_opdesc); int64_t node_id = src_node_opdesc->GetId(); - const string pld_name = kPlaceHolderType + std::to_string(num_of_pld_end_); + const string pld_name = kPlaceHolderType + std::to_string(graph_info_.num_of_pld_end_); auto pld_op_desc = MakeShared(pld_name, PLACEHOLDER); if (pld_op_desc == nullptr) { GELOGE(GRAPH_PARAM_INVALID, "pld_op_desc is nullptr."); return FAILED; } - GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "peerIndex", num_of_pld_end_), GELOGW("SetInt peerIndex failed");) + GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "peerIndex", graph_info_.num_of_pld_end_), + GELOGW("SetInt peerIndex failed");) GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentOpType", src_node->GetType()), GELOGW("SetStr parentOpType failed");) GE_IF_BOOL_EXEC(!AttrUtils::SetStr(pld_op_desc, "parentId", end_graph->GetName() + ":" + std::to_string(node_id)), @@ -356,10 +382,10 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr GE_IF_BOOL_EXEC(!AttrUtils::SetInt(pld_op_desc, "anchorIndex", AnchorUtils::GetIdx(out_anchor)), GELOGW("SetInt anchorIndex failed");) // do not care over flow - num_of_pld_end_++; + graph_info_.num_of_pld_end_++; // replace output_desc of pld with input node's output desc int output_index = ge::AnchorUtils::GetIdx(out_anchor); - is_need_update_desc = (output_index >= 0) && (mode_ == kPartitioning); + is_need_update_desc = (output_index >= 0) && (graph_info_.mode_ == kPartitioning); if (is_need_update_desc) { if (UpdatePldOpDesc(src_node, output_index, pld_op_desc) != SUCCESS) { GELOGE(GRAPH_PARAM_INVALID, "UpdateEndOpDesc failed, output index %d, engine name is %s", output_index, @@ -390,9 +416,9 @@ graphStatus ge::GraphPartitioner::AddPlaceHolderEndInSrcDstGraph(const AnchorPtr dst_node->GetName().c_str(), AnchorUtils::GetIdx(peer_in_anchor), pld_graph->GetName().c_str()); return FAILED; } - index_2_end_[num_of_pld_end_] = new_end_node; - pld_2_end_[new_pld_node] = new_end_node; - end_2_pld_[new_end_node] = new_pld_node; + graph_info_.index_2_end_[graph_info_.num_of_pld_end_] = new_end_node; + graph_info_.pld_2_end_[new_pld_node] = new_end_node; + graph_info_.end_2_pld_[new_end_node] = new_pld_node; return SUCCESS; } @@ -442,7 +468,7 @@ Status ge::GraphPartitioner::PutInputNodesInSubGraph(const ge::ComputeGraphPtr & GELOGE(FAILED, "parameter ptr is null."); return FAILED; } - for (auto &input_node : src_graph->GetAllNodes()) { + for (auto &input_node : src_graph->GetDirectNode()) { if (IsDataLike(input_node)) { if (input_node->SetOwnerComputeGraph(dst_graph) != GRAPH_SUCCESS) { GELOGE(FAILED, "[GraphPartitioner]: 
SetOwnerComputeGraph failed."); @@ -472,7 +498,7 @@ void ge::GraphPartitioner::AddNewGraphToPartition(ge::ComputeGraphPtr &input_gra GELOGW("[GraphPartitioner]: input_graph is null, engine name is %s", engine_name.c_str()); return; } - partitions_[input_graph] = engine_name; + graph_info_.partitions_[input_graph] = engine_name; } bool ge::GraphPartitioner::IsDataLike(ge::NodePtr node) { @@ -495,12 +521,12 @@ Status ge::GraphPartitioner::Initialize(ge::ComputeGraphPtr compute_graph) { GELOGE(GE_GRAPH_NOT_INIT, "Graph partitioner initialize failed."); return FAILED; } - engine_placer_.SetComputeGraph(compute_graph); - if (engine_placer_.Run() != SUCCESS) { + graph_info_.engine_placer_.SetComputeGraph(compute_graph); + if (graph_info_.engine_placer_.Run() != SUCCESS) { GELOGE(FAILED, "Engine placer run failed."); return FAILED; } - const NodeEngineMap *node_engine_map = engine_placer_.GetNodeEngineMap(); + const NodeEngineMap *node_engine_map = graph_info_.engine_placer_.GetNodeEngineMap(); size_t temp_index = 0; for (const auto &node : compute_graph->GetDirectNode()) { std::string temp_stream; @@ -523,12 +549,12 @@ Status ge::GraphPartitioner::Initialize(ge::ComputeGraphPtr compute_graph) { new_cluster->nodes_.push_back(node); if (!HasNoInput(node)) { for (const auto &parent : node->GetInAllNodes()) { - new_cluster->in_clu_.insert(node_2_cluster_.at(parent)->index_); - node_2_cluster_.at(parent)->out_clu_.insert(temp_index); + new_cluster->in_clu_.insert(graph_info_.node_2_cluster_.at(parent)->index_); + graph_info_.node_2_cluster_.at(parent)->out_clu_.insert(temp_index); } } - node_2_cluster_[node] = new_cluster; - clusters_[temp_index] = new_cluster; + graph_info_.node_2_cluster_[node] = new_cluster; + graph_info_.clusters_[temp_index] = new_cluster; GELOGD("Node name is %s, engine is %s, cluster index is %zu, stream label is %s", node->GetName().c_str(), new_cluster->engine_name_.c_str(), new_cluster->index_, new_cluster->stream_label_.c_str()); temp_index++; @@ -546,18 +572,20 @@ Status ge::GraphPartitioner::AddPartitionsToGraphNode(vectorGetName()); - GraphUtils::DumpGEGraphToOnnx(*subGraph, subGraph->GetName()); + auto &engine_name = graph_info_.partitions_.at(sub_graph); + GraphUtils::DumpGEGraph(sub_graph, sub_graph->GetName()); + GraphUtils::DumpGEGraphToOnnx(*sub_graph, sub_graph->GetName()); if (!session_graph_id.empty()) { - GE_IF_BOOL_EXEC(!AttrUtils::SetStr(subGraph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id), + GE_IF_BOOL_EXEC(!AttrUtils::SetStr(sub_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id), GELOGW("SetStr ATTR_NAME_SESSION_GRAPH_ID failed");) } + // flush parent node of subgraph + sub_graph->SetParentNode(compute_graph->GetParentNode()); if (engine_name != input_subgraph_name) { // do not add Data subGraph into SubGraphInfo auto sgi = MakeShared(); if (sgi == nullptr) { @@ -568,18 +596,20 @@ Status ge::GraphPartitioner::AddPartitionsToGraphNode(vectorSetEngineName(engine_name); // set stream label string sub_graph_stream; - if (AttrUtils::GetStr(subGraph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) { + if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) { sgi->SetStreamLabel(sub_graph_stream); } /// for now inputFlag is the same before and after partition. 
It should /// be changed according to the real partition - std::vector sub_graph_input(input_size_, true); - std::vector sub_graph_output(output_size_, true); - sgi->SetSubGraph(subGraph); + std::vector sub_graph_input(graph_info_.input_size_, true); + std::vector sub_graph_output(graph_info_.output_size_, true); + sgi->SetSubGraph(sub_graph); sgi->SetOutputFlag(sub_graph_output); sgi->SetInputFlag(sub_graph_input); - sgi->SetOutputContext(output_name_); + sgi->SetOutputContext(graph_info_.output_name_); AddEndPldInformationToSubGraphInfo(sgi); + GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", + engine_name.c_str(), sub_graph->GetName().c_str(), sgi->GetStreamLabel().c_str()); output_subgraphs.push_back(sgi); } } @@ -588,17 +618,18 @@ Status ge::GraphPartitioner::AddPartitionsToGraphNode(vectornodes_.empty()) || - (clusters_[child_cluster] == nullptr) || (clusters_[child_cluster]->nodes_.empty())) { + if ((graph_info_.clusters_[parent_cluster] == nullptr) || (graph_info_.clusters_[parent_cluster]->nodes_.empty()) || + (graph_info_.clusters_[child_cluster] == nullptr) || (graph_info_.clusters_[child_cluster]->nodes_.empty())) { return false; } // Check if parent_cluster,child_cluster has same engine or stream label - if ((clusters_[parent_cluster]->engine_name_ != clusters_[child_cluster]->engine_name_) || - (clusters_[parent_cluster]->stream_label_ != clusters_[child_cluster]->stream_label_)) { + if ((graph_info_.clusters_[parent_cluster]->engine_name_ != graph_info_.clusters_[child_cluster]->engine_name_) || + (graph_info_.clusters_[parent_cluster]->stream_label_ != graph_info_.clusters_[child_cluster]->stream_label_)) { GELOGD("Parent cluster %zu engine %s stream label %s, child cluster %zu engine %s stream label %s can not merge", - parent_cluster, clusters_[parent_cluster]->engine_name_.c_str(), - clusters_[parent_cluster]->stream_label_.c_str(), child_cluster, - clusters_[child_cluster]->engine_name_.c_str(), clusters_[child_cluster]->stream_label_.c_str()); + parent_cluster, graph_info_.clusters_[parent_cluster]->engine_name_.c_str(), + graph_info_.clusters_[parent_cluster]->stream_label_.c_str(), child_cluster, + graph_info_.clusters_[child_cluster]->engine_name_.c_str(), + graph_info_.clusters_[child_cluster]->stream_label_.c_str()); return false; } // Check if parent_cluster,child_cluster is reachable @@ -628,51 +659,52 @@ void ge::GraphPartitioner::MergeTwoClusters(size_t parent_cluster, size_t &child } // update node_2_cluster_ map - for (auto &node : clusters_[big_cluster]->nodes_) { - node_2_cluster_[node] = clusters_[small_cluster]; + for (auto &node : graph_info_.clusters_[big_cluster]->nodes_) { + graph_info_.node_2_cluster_[node] = graph_info_.clusters_[small_cluster]; } // merge nodes - clusters_[small_cluster]->nodes_.splice(clusters_[small_cluster]->nodes_.end(), clusters_[big_cluster]->nodes_); + graph_info_.clusters_[small_cluster]->nodes_.splice(graph_info_.clusters_[small_cluster]->nodes_.end(), + graph_info_.clusters_[big_cluster]->nodes_); // merge all input & output to small cluster - clusters_[small_cluster]->in_clu_.insert(clusters_[big_cluster]->in_clu_.begin(), - clusters_[big_cluster]->in_clu_.end()); - clusters_[small_cluster]->out_clu_.insert(clusters_[big_cluster]->out_clu_.begin(), - clusters_[big_cluster]->out_clu_.end()); + graph_info_.clusters_[small_cluster]->in_clu_.insert(graph_info_.clusters_[big_cluster]->in_clu_.begin(), + graph_info_.clusters_[big_cluster]->in_clu_.end()); + 
graph_info_.clusters_[small_cluster]->out_clu_.insert(graph_info_.clusters_[big_cluster]->out_clu_.begin(), + graph_info_.clusters_[big_cluster]->out_clu_.end()); // remove child_cluster's out parent_cluster's in between child_cluster and parent_cluster RemoveEdge(parent_cluster, child_cluster_original); // update in/out of the cluster with bigger index - for (auto in_clu : clusters_[big_cluster]->in_clu_) { - clusters_[in_clu]->out_clu_.insert(small_cluster); - clusters_[in_clu]->out_clu_.erase(big_cluster); + for (auto in_clu : graph_info_.clusters_[big_cluster]->in_clu_) { + graph_info_.clusters_[in_clu]->out_clu_.insert(small_cluster); + graph_info_.clusters_[in_clu]->out_clu_.erase(big_cluster); } - for (auto out_clu : clusters_[big_cluster]->out_clu_) { - clusters_[out_clu]->in_clu_.insert(small_cluster); - clusters_[out_clu]->in_clu_.erase(big_cluster); + for (auto out_clu : graph_info_.clusters_[big_cluster]->out_clu_) { + graph_info_.clusters_[out_clu]->in_clu_.insert(small_cluster); + graph_info_.clusters_[out_clu]->in_clu_.erase(big_cluster); } - clusters_[big_cluster] = clusters_[small_cluster]; + graph_info_.clusters_[big_cluster] = graph_info_.clusters_[small_cluster]; } void ge::GraphPartitioner::RemoveEdge(size_t parent_cluster, size_t child_cluster) { - clusters_[child_cluster]->in_clu_.erase(parent_cluster); - clusters_[parent_cluster]->out_clu_.erase(child_cluster); + graph_info_.clusters_[child_cluster]->in_clu_.erase(parent_cluster); + graph_info_.clusters_[parent_cluster]->out_clu_.erase(child_cluster); } void ge::GraphPartitioner::InsertEdge(size_t from, size_t to) { if (from == to) { return; } - if (!clusters_[from]->out_clu_.insert(to).second) { + if (!graph_info_.clusters_[from]->out_clu_.insert(to).second) { // edge has already exists return; } - clusters_[to]->in_clu_.insert(from); + graph_info_.clusters_[to]->in_clu_.insert(from); } void ge::GraphPartitioner::MarkClusters() { - GELOGI("MarkClusters starts. cluster size is %zu", clusters_.size()); - size_t cluster_size = clusters_.size(); + GELOGI("MarkClusters starts. 
cluster size is %zu", graph_info_.clusters_.size()); + size_t cluster_size = graph_info_.clusters_.size(); for (size_t child_cluster = 0; child_cluster < cluster_size; child_cluster++) { - auto found_child_cluster = clusters_[child_cluster]; + auto found_child_cluster = graph_info_.clusters_[child_cluster]; if (found_child_cluster == nullptr) { GELOGW("can not found child_cluster is %zu", child_cluster); continue; @@ -684,7 +716,8 @@ void ge::GraphPartitioner::MarkClusters() { } // sort cluster according to it's output amount auto comp_func = [this](const size_t &parent_cluster1, const size_t &parent_cluster2) -> bool { - return clusters_[parent_cluster1]->out_clu_.size() < clusters_[parent_cluster2]->out_clu_.size(); + return graph_info_.clusters_[parent_cluster1]->out_clu_.size() < + graph_info_.clusters_[parent_cluster2]->out_clu_.size(); }; std::sort(ordered_cluster.begin(), ordered_cluster.end(), comp_func); auto child_merged = child_cluster; @@ -709,14 +742,14 @@ Status ge::GraphPartitioner::SplitSubGraphs(ge::ComputeGraphPtr compute_graph) { // add pld&end for (auto &node : compute_graph->GetDirectNode()) { GELOGD("Node name is %s.", node->GetName().c_str()); - auto child_cluster = node_2_cluster_[node]; + auto child_cluster = graph_info_.node_2_cluster_[node]; ge::ComputeGraphPtr corresponding_graph; // unordered_set's insert returns a pair, second of pair is bool if (!cluster_set.insert(child_cluster).second) { GELOGD("Old sub graph, child_cluster is %zu", child_cluster->index_); - corresponding_graph = cluster_2_partition_.at(child_cluster); + corresponding_graph = graph_info_.cluster_2_partition_.at(child_cluster); } else { - std::string graph_name = "new_sub_graph" + std::to_string(partitions_.size()); + std::string graph_name = "new_sub_graph" + std::to_string(graph_info_.partitions_.size()); ComputeGraphPtr new_sub_graph = MakeShared(graph_name); if (new_sub_graph == nullptr) { GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MakeShared() failed."); @@ -724,7 +757,7 @@ Status ge::GraphPartitioner::SplitSubGraphs(ge::ComputeGraphPtr compute_graph) { } AddNewGraphToPartition(new_sub_graph, child_cluster->engine_name_); corresponding_graph = new_sub_graph; - cluster_2_partition_[child_cluster] = corresponding_graph; + graph_info_.cluster_2_partition_[child_cluster] = corresponding_graph; GELOGD("New sub graph, name is %s", graph_name.c_str()); } // build node to corresponding node map @@ -733,20 +766,20 @@ Status ge::GraphPartitioner::SplitSubGraphs(ge::ComputeGraphPtr compute_graph) { GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: AddNode() failed."); return FAILED; } - corresponding_node_in_partitions_[node] = corresponding_node; + graph_info_.corresponding_node_in_partitions_[node] = corresponding_node; GE_CHK_STATUS_RET(corresponding_node->SetOwnerComputeGraph(corresponding_graph)) for (const auto &in_anchor : node->GetAllInAnchors()) { GELOGD("In anchor index is %d", AnchorUtils::GetIdx(in_anchor)); for (auto &peer_out_anchor : in_anchor->GetPeerAnchors()) { GELOGD("Peer out anchor index is %d", AnchorUtils::GetIdx(peer_out_anchor)); // All nodes have a copy in corresponding_node_in_partitions_, so function at can not be execption - auto parent_node = corresponding_node_in_partitions_.at(peer_out_anchor->GetOwnerNode()); + auto parent_node = graph_info_.corresponding_node_in_partitions_.at(peer_out_anchor->GetOwnerNode()); GELOGD("Parent node name is %s", parent_node->GetName().c_str()); // add edge auto src_anchor = 
parent_node->GetOutAnchor(AnchorUtils::GetIdx(peer_out_anchor)); auto dst_anchor = corresponding_node->GetInAnchor(AnchorUtils::GetIdx(in_anchor)); // if child and parent's cluster is not the same, add pld and end - auto parent_cluster = node_2_cluster_[peer_out_anchor->GetOwnerNode()]; + auto parent_cluster = graph_info_.node_2_cluster_[peer_out_anchor->GetOwnerNode()]; if (parent_cluster != child_cluster) { GELOGD("Parent cluster is %zu, child_cluster is %zu", parent_cluster->index_, child_cluster->index_); if (AddPlaceHolderEnd(peer_out_anchor, in_anchor) != ge::SUCCESS) { @@ -771,7 +804,7 @@ Status ge::GraphPartitioner::SplitSubGraphs(ge::ComputeGraphPtr compute_graph) { /// before calling this function, the direct path between src and dst is already removed. /// return true if a second path is found bool ge::GraphPartitioner::HasSecondPath(size_t src, size_t dst, size_t upper_bound) { - if (clusters_.at(src)->out_clu_.empty() || clusters_.at(dst)->in_clu_.empty()) { + if (graph_info_.clusters_.at(src)->out_clu_.empty() || graph_info_.clusters_.at(dst)->in_clu_.empty()) { return false; } /// Avoid recursion since stack space might be limited. @@ -782,7 +815,7 @@ bool ge::GraphPartitioner::HasSecondPath(size_t src, size_t dst, size_t upper_bo while (!temp_stack.empty()) { size_t cluster = temp_stack.back(); temp_stack.pop_back(); - ClusterPtr cur_cluster = clusters_[cluster]; + ClusterPtr cur_cluster = graph_info_.clusters_[cluster]; if (!visited.insert(cluster).second) { continue; } @@ -798,21 +831,40 @@ bool ge::GraphPartitioner::HasSecondPath(size_t src, size_t dst, size_t upper_bo return false; } -Status ge::GraphPartitioner::Partition(ge::ComputeGraphPtr compute_graph, vector &output_subgraphs, - Mode mode) { - ClearAllPartitionData(mode); +Status ge::GraphPartitioner::Partition(ge::ComputeGraphPtr compute_graph, Mode mode) { + graph_2_graph_partition_info_.clear(); + graph_2_subgraph_list_.clear(); + auto ret = PartitionSubGraph(compute_graph, mode); + if (ret != SUCCESS) { + GELOGE(ret, "Sub graph partition failed"); + return ret; + } + // partition each subgraph + for (const auto &sub_graph : compute_graph->GetAllSubgraphs()) { + ret = PartitionSubGraph(sub_graph, mode); + if (ret != SUCCESS) { + GELOGE(ret, "Sub graph partition failed"); + return ret; + } + } + return SUCCESS; +} + +Status ge::GraphPartitioner::PartitionSubGraph(ge::ComputeGraphPtr compute_graph, Mode mode) { if (compute_graph == nullptr) { GELOGE(GE_GRAPH_NULL_INPUT, "[GraphPartitioner]: compute_graph is null."); return FAILED; } - output_name_ = compute_graph->GetOutput(); - output_size_ = compute_graph->GetOutputSize(); - input_size_ = compute_graph->GetInputSize(); - if (output_size_ == 0) { + // clear graph_info + graph_info_.ClearAllData(mode); + graph_info_.output_name_ = compute_graph->GetOutput(); + graph_info_.output_size_ = compute_graph->GetOutputSize(); + graph_info_.input_size_ = compute_graph->GetInputSize(); + if (graph_info_.output_size_ == 0) { GELOGE(GE_GRAPH_NULL_INPUT, "The output size needs to be greater than 0."); return FAILED; } - GELOGI("Graph partition starts, graph nodes size is %zu", compute_graph->GetDirectNodesSize()); + GELOGI("Graph Partition starts, graph nodes size is %zu", compute_graph->GetDirectNodesSize()); Status ret = compute_graph->TopologicalSorting(); if (ret != SUCCESS) { GELOGE(GE_GRAPH_TOPO_SORT_FAILED, "[GraphPartitioner]: subGraphPtr->TopologicalSorting failed"); @@ -840,15 +892,18 @@ Status ge::GraphPartitioner::Partition(ge::ComputeGraphPtr compute_graph, vector }
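// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): HasSecondPath above replaces
// recursion with an explicit stack so deep cluster graphs cannot overflow the
// call stack. A minimal, self-contained version of that bounded reachability
// check; Cluster and the adjacency layout are hypothetical simplifications of
// the real types.
#include <cstddef>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Cluster {
  std::unordered_set<std::size_t> out_clu_;  // indices of downstream clusters
};

bool HasPath(const std::unordered_map<std::size_t, Cluster> &clusters, std::size_t src, std::size_t dst,
             std::size_t upper_bound) {
  std::vector<std::size_t> stack{src};  // explicit DFS stack instead of recursion
  std::unordered_set<std::size_t> visited;
  while (!stack.empty()) {
    std::size_t cur = stack.back();
    stack.pop_back();
    if (!visited.insert(cur).second) {
      continue;  // already expanded
    }
    for (std::size_t next : clusters.at(cur).out_clu_) {
      if (next == dst) {
        return true;  // a second path exists
      }
      if (next < upper_bound) {
        stack.push_back(next);  // only walk clusters below the bound
      }
    }
  }
  return false;
}
// ---------------------------------------------------------------------------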
GE_TIMESTAMP_END(GraphPartitionSortSubGraphs, "GraphPartitioner::PartitionSortSubGraphs"); GE_TIMESTAMP_START(GraphPartitionAddPartitionsToGraphNode); + vector output_subgraphs; if (AddPartitionsToGraphNode(output_subgraphs, compute_graph) != ge::SUCCESS) { GELOGE(GE_GRAPH_EMPTY_PARTITION, "Graph Partition AddPartitionsToGraphNode failed."); return ge::FAILED; } GE_TIMESTAMP_END(GraphPartitionAddPartitionsToGraphNode, "GraphPartitioner::PartitionAddPartitionsToGraphNode"); - GELOGI("Graph partition ends. Adding partitions to SubGraphInfo, got %zu sub graphs", output_subgraphs.size()); - mode_ = kMerging; + GELOGI("Graph Partition ends. Adding partitions to SubGraphInfo, got %zu sub graphs", output_subgraphs.size()); + graph_info_.mode_ = kMerging; // do not care about overflow partition_times_++; + graph_2_graph_partition_info_[compute_graph] = graph_info_; + graph_2_subgraph_list_[compute_graph] = output_subgraphs; return SUCCESS; } @@ -866,8 +921,10 @@ Status ge::GraphPartitioner::AddPlaceHolderEnd(const AnchorPtr &out_anchor, cons return FAILED; } // All nodes have a copy in corresponding_node_in_partitions_, so at() cannot throw an exception - auto src_anchor = corresponding_node_in_partitions_.at(src_node)->GetOutAnchor(AnchorUtils::GetIdx(out_anchor)); - auto dst_anchor = corresponding_node_in_partitions_.at(dst_node)->GetInAnchor(AnchorUtils::GetIdx(in_anchor)); + auto src_anchor = + graph_info_.corresponding_node_in_partitions_.at(src_node)->GetOutAnchor(AnchorUtils::GetIdx(out_anchor)); + auto dst_anchor = + graph_info_.corresponding_node_in_partitions_.at(dst_node)->GetInAnchor(AnchorUtils::GetIdx(in_anchor)); if ((src_anchor == nullptr) || (dst_anchor == nullptr)) { GELOGE(GE_GRAPH_PARAM_NULLPTR, "src_anchor or dst_anchor is null."); return FAILED; } @@ -893,29 +950,29 @@ Status ge::GraphPartitioner::SortSubGraphs(const ge::ComputeGraphPtr &compute_gr } for (const auto &node : compute_graph->GetDirectNode()) { // All nodes in the original graph have a copy in corresponding_node_in_partitions_, so it cannot be null - auto sub_graph = corresponding_node_in_partitions_.at(node)->GetOwnerComputeGraph(); - if ((partitions_2_rank_.find(sub_graph) == partitions_2_rank_.end()) && - (partitions_[sub_graph] != kEngineDefaultData)) { - partitions_2_rank_[sub_graph] = rank; - rank_2_partitions_.push_back(sub_graph); + auto sub_graph = graph_info_.corresponding_node_in_partitions_.at(node)->GetOwnerComputeGraph(); + if ((graph_info_.partitions_2_rank_.find(sub_graph) == graph_info_.partitions_2_rank_.end()) && + (graph_info_.partitions_[sub_graph] != kEngineDefaultData)) { + graph_info_.partitions_2_rank_[sub_graph] = rank; + graph_info_.rank_2_partitions_.push_back(sub_graph); rank++; - } else if (partitions_[sub_graph] == kEngineDefaultData) { // merge data graph + } else if (graph_info_.partitions_[sub_graph] == kEngineDefaultData) { // merge data graph if (PutInputNodesInSubGraph(sub_graph, new_input_nodes_sub_graph) != SUCCESS) { GELOGE(FAILED, "[GraphPartitioner]: putInputNodesInSubGraph failed."); return FAILED; } - auto to_be_del = partitions_.find(sub_graph); - partitions_.erase(to_be_del); + auto to_be_del = graph_info_.partitions_.find(sub_graph); + graph_info_.partitions_.erase(to_be_del); } } if (!new_input_nodes_sub_graph->GetDirectNode().empty()) { - rank_2_partitions_.insert(rank_2_partitions_.begin(), new_input_nodes_sub_graph); - partitions_2_rank_[new_input_nodes_sub_graph] = 0; + graph_info_.rank_2_partitions_.insert(graph_info_.rank_2_partitions_.begin(),
new_input_nodes_sub_graph); + graph_info_.partitions_2_rank_[new_input_nodes_sub_graph] = 0; AddNewGraphToPartition(new_input_nodes_sub_graph, "inputNodesSubGraph"); } // reinit rank rank = kRankZero; - for (const auto &it : rank_2_partitions_) { + for (const auto &it : graph_info_.rank_2_partitions_) { // rename subGraph based on rank if (it != nullptr) { // rename subGraph based on rank @@ -965,15 +1022,17 @@ void ge::GraphPartitioner::AddEndPldInformationToSubGraphInfo(ge::SubGraphInfoPt GE_CHECK_NOTNULL_JUST_RETURN(sub_graph); NodetoNodeMap end_map; NodetoNodeMap pld_map; - for (const auto &node : sub_graph->GetAllNodes()) { + for (const auto &node : sub_graph->GetDirectNode()) { if (node->GetType() == kEndType) { - end_map[node] = end_2_pld_.at(node); + end_map[node] = graph_info_.end_2_pld_.at(node); } if (node->GetType() == kPlaceHolderType) { - pld_map[node] = pld_2_end_.at(node); + pld_map[node] = graph_info_.pld_2_end_.at(node); } } sub_graph_info->SetEnd2PldMap(end_map); sub_graph_info->SetPld2EndMap(pld_map); } + +const Graph2SubGraphInfoList &ge::GraphPartitioner::GetSubGraphMap() { return graph_2_subgraph_list_; } } // namespace ge diff --git a/src/ge/graph/partition/graph_partition.h b/src/ge/graph/partition/graph_partition.h index 3ac18e48..51cafb47 100644 --- a/src/ge/graph/partition/graph_partition.h +++ b/src/ge/graph/partition/graph_partition.h @@ -57,18 +57,23 @@ class GraphPartitioner { /// MergeAfterSubGraphOptimization() can only be called in Merge mode. /// After Partition(), change to Merge mode. After MergeAfterSubGraphOptimization(), change to Partition mode enum Mode { kPartitioning, kSecondPartitioning, kMerging }; - GraphPartitioner() : mode_(GraphPartitioner::kPartitioning) {} + GraphPartitioner() : partition_times_(0){}; ~GraphPartitioner() = default; // the main method that partitions the graph // input_size and output_size are the number of inputs and outputs in the original graph - Status Partition(ComputeGraphPtr compute_graph, vector &output_subgraphs, Mode mode); + Status Partition(ComputeGraphPtr compute_graph, Mode mode); // after partition, all SubGraph will be merged back based on end<->pld. 
Status MergeAfterSubGraphOptimization(ComputeGraphPtr &output_merged_compute_graph, - const std::vector &sub_graph_list); + const ComputeGraphPtr &original_compute_graph); + // Return all subgraphs + const Graph2SubGraphInfoList &GetSubGraphMap(); private: + Status MergeSubGraph(ge::ComputeGraphPtr &output_merged_compute_graph, + const ge::ComputeGraphPtr &original_compute_graph); + Status PartitionSubGraph(ge::ComputeGraphPtr compute_graph, Mode mode); Status MergeAllSubGraph(ComputeGraphPtr &output_merged_compute_graph, const std::vector &sub_graph_list); Status CheckIfEnd2PldEmpty(ComputeGraphPtr &output_merged_compute_graph); @@ -129,25 +134,47 @@ class GraphPartitioner { void ClearAllPartitionData(Mode mode); void SetMergedGraphId(ComputeGraphPtr &output_merged_compute_graph); - // private local variables - EnginePlacer engine_placer_; - PartitionMap partitions_; // sub-graphs after partition - std::unordered_map partitions_2_rank_; // - std::vector rank_2_partitions_; // - NodetoNodeMap corresponding_node_in_partitions_; // mapping between a node in the original graph and - uint32_t num_of_pld_end_ = 0; // a counter to track 'place holder' and 'end' - size_t input_size_ = 0; - size_t output_size_ = 0; - std::string output_name_; - NodetoNodeMap end_2_pld_; // mapping between each 'end' and 'placeHolder' node - NodetoNodeMap pld_2_end_; // mapping between each 'placeHolder' and 'end' node - std::map index_2_end_; // order mapping between peerindex and 'end' node - Mode mode_ = kPartitioning; - uint32_t partition_times_ = 0; // times of call partition - std::vector transfer_graph_; // contains all transfer graphs - std::unordered_map clusters_; // index to cluster ptr, contains all nodes - std::unordered_map> node_2_cluster_; // node map to cluster - std::unordered_map, ComputeGraphPtr> cluster_2_partition_; // cluster map to subgraph + struct GraphPartitionInfo { + EnginePlacer engine_placer_; + PartitionMap partitions_; // sub-graphs after partition + std::unordered_map partitions_2_rank_; // + std::vector rank_2_partitions_; // + NodetoNodeMap corresponding_node_in_partitions_; // mapping between a node in the original graph and + uint32_t num_of_pld_end_; // a counter to track 'place holder' and 'end' + size_t input_size_; + size_t output_size_; + std::string output_name_; + NodetoNodeMap end_2_pld_; // mapping between each 'end' and 'placeHolder' node + NodetoNodeMap pld_2_end_; // mapping between each 'placeHolder' and 'end' node + std::map index_2_end_; // order mapping between peerindex and 'end' node + Mode mode_; + std::unordered_map clusters_; // index to cluster ptr, contains all nodes + std::unordered_map> node_2_cluster_; // node map to cluster + std::unordered_map, ComputeGraphPtr> cluster_2_partition_; // cluster map to subgraph + void ClearAllData(Mode mode) { + rank_2_partitions_.clear(); + partitions_2_rank_.clear(); + partitions_.clear(); + corresponding_node_in_partitions_.clear(); + index_2_end_.clear(); + cluster_2_partition_.clear(); + clusters_.clear(); + node_2_cluster_.clear(); + pld_2_end_.clear(); + end_2_pld_.clear(); + if (mode_ == kMerging) { + mode_ = kPartitioning; + } else { + mode_ = mode; + } + } + GraphPartitionInfo() : num_of_pld_end_(0), input_size_(0), output_size_(0), mode_(kPartitioning) {} + ~GraphPartitionInfo() = default; + }; + std::unordered_map graph_2_graph_partition_info_; + Graph2SubGraphInfoList graph_2_subgraph_list_; + GraphPartitionInfo graph_info_; + uint32_t partition_times_; // number of times Partition() is called }; } // namespace ge
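// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the header change above folds
// the partitioner's flat member state into a GraphPartitionInfo value that is
// reset per graph and snapshotted into graph_2_graph_partition_info_. The same
// pattern in miniature, with hypothetical simplified types:
#include <string>
#include <unordered_map>
#include <vector>

struct PartitionInfo {
  std::vector<std::string> partitions;         // results for one graph
  void ClearAllData() { partitions.clear(); }  // reset before reuse
};

class Partitioner {
 public:
  void PartitionOne(const std::string &graph_name) {
    info_.ClearAllData();                // scratch state starts clean
    info_.partitions.push_back(graph_name + "_sub_0");
    graph_to_info_[graph_name] = info_;  // snapshot the per-graph result
  }

 private:
  PartitionInfo info_;                   // working state for the current graph
  std::unordered_map<std::string, PartitionInfo> graph_to_info_;
};
// ---------------------------------------------------------------------------
diff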
--git a/src/ge/graph/passes/aicpu_constant_folding_pass.cc b/src/ge/graph/passes/aicpu_constant_folding_pass.cc index 24e58656..e1e6842f 100644 --- a/src/ge/graph/passes/aicpu_constant_folding_pass.cc +++ b/src/ge/graph/passes/aicpu_constant_folding_pass.cc @@ -32,18 +32,18 @@ namespace { const char *const kKernelLibName = "aicpu_kernel"; +const char *const kNotSupported = "0"; const uint64_t kReleaseFlag = 1; +const uint64_t kOpsFlag = 1; const uint64_t kDouble = 2; } // namespace namespace ge { Status AicpuConstantFoldingPass::Run(ge::NodePtr &node) { GE_CHECK_NOTNULL(node); - GELOGD("Begin to run aicpu constant folding on node %s", node->GetName().c_str()); - if (node->GetType() == NETOUTPUT) { - GELOGI("Skip aicpu constant folding on node[netoutput] %s", node->GetName().c_str()); + GELOGD("Start aicpu constant folding on node [%s]", node->GetName().c_str()); + if (IsSkipFold(node)) { return SUCCESS; } - vector weight_vec; bool flag = CheckInput(node, weight_vec); if (!flag) { @@ -110,7 +110,7 @@ bool AicpuConstantFoldingPass::CheckInput(const NodePtr &node, vectorGetInputsSize()) { - GELOGD("Const input nodes size is %zu, and nodeDesc inputsSize is %zu.", input_nodes.size(), + GELOGD("Const input nodes size is %zu, and nodeDesc inputsSize is %zu, skip fold.", input_nodes.size(), node_desc->GetInputsSize()); return false; } @@ -166,12 +166,16 @@ Status AicpuConstantFoldingPass::GenerateDataPtrInfo(const vector &out GE_CHK_RT_RET(rtMalloc(&raw_data_addr, result_summary.raw_data_size, RT_MEMORY_HBM)); void *shape_data_addr = nullptr; - rtError_t rt_ret = rtMalloc(&shape_data_addr, result_summary.shape_data_size, RT_MEMORY_HBM); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMalloc error"); - GE_CHK_RT(rtFree(raw_data_addr)); - return FAILED; + // shape_data_size = 0 means scalar + if (result_summary.shape_data_size != 0) { + rtError_t rt_ret = rtMalloc(&shape_data_addr, result_summary.shape_data_size, RT_MEMORY_HBM); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(rt_ret, "rtMalloc error"); + GE_CHK_RT(rtFree(raw_data_addr)); + return FAILED; + } } + DataPtrInfo raw_data_info; raw_data_info.release_flag = kReleaseFlag; raw_data_info.data_size = result_summary.raw_data_size; @@ -192,7 +196,7 @@ Status AicpuConstantFoldingPass::GenerateDataPtrInfo(const vector &out return SUCCESS; } -Status AicpuConstantFoldingPass::UpdateWorkSpaceAddr(string &task_info, STR_FWK_OP_KERNEL &task) const { +Status AicpuConstantFoldingPass::UpdateWorkSpaceAddr(string &task_info, STR_FWK_OP_KERNEL &task) { // Update the workspace_addr if (task_info.empty()) { GELOGE(FAILED, "task_info is empty "); @@ -213,8 +217,7 @@ Status AicpuConstantFoldingPass::UpdateWorkSpaceAddr(string &task_info, STR_FWK_ return SUCCESS; } -Status AicpuConstantFoldingPass::UpdateInputAndOutputAddr(const vector &io_addrs, - STR_FWK_OP_KERNEL &task) const { +Status AicpuConstantFoldingPass::UpdateInputAndOutputAddr(const vector &io_addrs, STR_FWK_OP_KERNEL &task) { auto addrs_size = sizeof(uint64_t) * (io_addrs.size()); if (addrs_size <= 0) { GELOGE(FAILED, "addrs_size is less than 1 "); @@ -414,7 +417,7 @@ Status AicpuConstantFoldingPass::LaunchMemCopyTask(const vector &data_ return SUCCESS; } -Status AicpuConstantFoldingPass::GenerateTaskForLaunch(STR_FWK_OP_KERNEL &aicpu_task, void *&task_buf) const { +Status AicpuConstantFoldingPass::GenerateTaskForLaunch(STR_FWK_OP_KERNEL &aicpu_task, void *&task_buf) { GE_CHK_RT_RET(rtMalloc(&task_buf, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM)); rtError_t rt_ret = rtMemcpy(task_buf, 
sizeof(STR_FWK_OP_KERNEL), reinterpret_cast(&aicpu_task), @@ -427,7 +430,7 @@ Status AicpuConstantFoldingPass::GenerateTaskForLaunch(STR_FWK_OP_KERNEL &aicpu_ return SUCCESS; } -Status AicpuConstantFoldingPass::KernelLaunch(void *task_buf) const { +Status AicpuConstantFoldingPass::KernelLaunch(void *task_buf) { rtModel_t model = nullptr; rtStream_t stream = nullptr; rtStream_t stream_run = nullptr; @@ -517,10 +520,17 @@ Status AicpuConstantFoldingPass::GenerateGeTensor(const OpDescPtr &node_desc, co GE_IF_BOOL_EXEC(output_ptr->SetData(data_addr.get(), raw_data_size) != GRAPH_SUCCESS, GELOGE(FAILED, "set data failed"); return FAILED); - GELOGI("GenerateGeTensor: raw_data_size %lu", raw_data_size); + GELOGD("GenerateGeTensor: raw_data_size %lu", raw_data_size); const DataPtrInfo &shape_data_info = data_vec.at(i * kDouble + 1); uint64_t shape_data_size = shape_data_info.data_size; + GELOGD("GenerateGeTensor: shape_data_size %lu", shape_data_size); + if (shape_data_size == 0) { + GELOGW("node[%s] outshape is scalar, skip copy shape", node_desc->GetName().c_str()); + output_ptr->MutableTensorDesc().SetShape(GeShape()); + outputs.emplace_back(output_ptr); + continue; + } uint64_t dim_num = shape_data_size / sizeof(uint64_t); std::unique_ptr shape_addr(new (std::nothrow) int64_t[dim_num]()); if (shape_addr == nullptr) { @@ -530,13 +540,12 @@ Status AicpuConstantFoldingPass::GenerateGeTensor(const OpDescPtr &node_desc, co GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), shape_data_size, reinterpret_cast(reinterpret_cast(shape_data_info.dst_ptr)), shape_data_size, RT_MEMCPY_DEVICE_TO_HOST)); - std::vector shapeDims; - for (size_t idx = 0; idx < dim_num; idx++) { - shapeDims.push_back(shape_addr[idx]); - GELOGI("GenerateGeTensor: dim %ld", shape_addr[idx]); + std::vector shape_dims; + for (size_t j = 0; j < dim_num; j++) { + shape_dims.push_back(shape_addr[j]); + GELOGD("GenerateGeTensor: dim %ld", shape_addr[j]); } - output_ptr->MutableTensorDesc().SetShape(GeShape(shapeDims)); - + output_ptr->MutableTensorDesc().SetShape(GeShape(shape_dims)); outputs.emplace_back(output_ptr); } return SUCCESS; @@ -544,7 +553,7 @@ Status AicpuConstantFoldingPass::GenerateGeTensor(const OpDescPtr &node_desc, co void AicpuConstantFoldingPass::ReleaseMemory(const vector &input_addrs, const vector &output_addrs, - const vector &data_vec) const { + const vector &data_vec) { for (const auto &item : input_addrs) { GE_CHK_RT(rtFree(reinterpret_cast(reinterpret_cast(item.input_addr)))); } @@ -552,7 +561,38 @@ void AicpuConstantFoldingPass::ReleaseMemory(const vector &input_ad GE_CHK_RT(rtFree(reinterpret_cast(reinterpret_cast(item)))); } for (const auto &item : data_vec) { - GE_CHK_RT(rtFree(reinterpret_cast(reinterpret_cast(item.dst_ptr)))); + auto dst_ptr = reinterpret_cast(reinterpret_cast(item.dst_ptr)); + if (dst_ptr != nullptr) { + GE_CHK_RT(rtFree(dst_ptr)); + } + } +} + +bool AicpuConstantFoldingPass::IsSkipFold(const ge::NodePtr &node) { + GE_CHECK_NOTNULL(node); + string type = node->GetType(); + if (type == ge::FRAMEWORKOP) { + if (!ge::AttrUtils::GetStr(node->GetOpDesc(), ge::ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, type)) { + GELOGW("Skip aicpu constant folding on frameworkop node [%s]", node->GetName().c_str()); + return true; + } + } + auto instance_ptr = ge::GELib::GetInstance(); + if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { + GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GE is not initialized"); + return true; + } + OpsKernelInfoStorePtr kernel_info = 
instance_ptr->OpsKernelManagerObj().GetOpsKernelInfoStore(kKernelLibName); + if (kernel_info == nullptr) { + GELOGE(FAILED, "Get op kernel info store failed"); + return true; + } + std::string check_result; + kernel_info->opsFlagCheck(*node, check_result); + if (check_result.empty()) { + GELOGE(FAILED, "Get op check_result failed"); + return true; } + return check_result.substr(0, kOpsFlag) == kNotSupported; } } // namespace ge diff --git a/src/ge/graph/passes/aicpu_constant_folding_pass.h b/src/ge/graph/passes/aicpu_constant_folding_pass.h index 615bf060..02babd8e 100644 --- a/src/ge/graph/passes/aicpu_constant_folding_pass.h +++ b/src/ge/graph/passes/aicpu_constant_folding_pass.h @@ -17,8 +17,8 @@ #ifndef GE_GRAPH_PASSES_AICPU_CONSTANT_FOLDING_PASS_H_ #define GE_GRAPH_PASSES_AICPU_CONSTANT_FOLDING_PASS_H_ -#include #include +#include #include "common/opskernel/ops_kernel_info_store.h" #include "graph/passes/folding_pass.h" @@ -43,15 +43,16 @@ class AicpuConstantFoldingPass : public FoldingPass { uint64_t dst_ptr; } __attribute__((packed)); bool CheckInput(const ge::NodePtr &node, vector &weight_vec); + bool IsSkipFold(const ge::NodePtr &node); Status GetInputAddrs(const vector &weight_vec, vector &input_addrs); Status GetOutputAddrs(const OpDescPtr &node_desc, vector &output_addrs); - Status GenerateTaskForLaunch(STR_FWK_OP_KERNEL &aicpu_task, void *&task_buf) const; + Status GenerateTaskForLaunch(STR_FWK_OP_KERNEL &aicpu_task, void *&task_buf); Status GenerateDataPtrInfo(const vector &output_addrs, vector &data_vec, vector &data_infos); Status GenerateGeTensor(const OpDescPtr &node_desc, const vector &data_vec, vector &outputs); - Status UpdateWorkSpaceAddr(string &task_info, STR_FWK_OP_KERNEL &task) const; - Status UpdateInputAndOutputAddr(const vector &io_addrs, STR_FWK_OP_KERNEL &task) const; + Status UpdateWorkSpaceAddr(string &task_info, STR_FWK_OP_KERNEL &task); + Status UpdateInputAndOutputAddr(const vector &io_addrs, STR_FWK_OP_KERNEL &task); Status UpdateSingleOpAddr(string &task_info, const vector &input_addrs, const vector &outputs_addr_vec, STR_FWK_OP_KERNEL &task); Status UpdateMemCopyAddr(string &task_info, const vector &data_infos, vector &internal_addrs, @@ -60,8 +61,8 @@ class AicpuConstantFoldingPass : public FoldingPass { const vector &output_addrs); Status LaunchMemCopyTask(const vector &data_infos); void ReleaseMemory(const vector &input_addrs, const vector &output_addrs, - const vector &data_vec) const; - Status KernelLaunch(void *aicpu_task) const; + const vector &data_vec); + Status KernelLaunch(void *aicpu_task); }; } // namespace ge diff --git a/src/ge/graph/passes/atomic_addr_clean_pass.cc b/src/ge/graph/passes/atomic_addr_clean_pass.cc index 39f6a6d9..6c312efa 100644 --- a/src/ge/graph/passes/atomic_addr_clean_pass.cc +++ b/src/ge/graph/passes/atomic_addr_clean_pass.cc @@ -33,6 +33,7 @@ bool is_loop_graph = false; } namespace ge { Status AtomicAddrCleanPass::Run(ComputeGraphPtr graph) { + GE_TIMESTAMP_START(AtomicAddrCleanPass); if (graph == nullptr) { GELOGE(PARAM_INVALID, "param [graph] must not be null."); return PARAM_INVALID; @@ -68,6 +69,7 @@ Status AtomicAddrCleanPass::Run(ComputeGraphPtr graph) { } } GELOGD("AtomicAddrCleanPass end."); + GE_TIMESTAMP_END(AtomicAddrCleanPass, "GraphManager::AtomicAddrCleanPass"); return SUCCESS; } @@ -155,7 +157,7 @@ NodePtr AtomicAddrCleanPass::InsertAtomicAddrCleanNode(ComputeGraphPtr &graph) { Status AtomicAddrCleanPass::LinkToAtomicNode(const NodePtr &atomic_node, NodePtr &atomic_clean_node) { 
GE_IF_BOOL_EXEC(atomic_node == nullptr || atomic_clean_node == nullptr, - GE_LOGE("param [atomic_node][atomic_clean_node] must not be null."); + DOMI_LOGE("param [atomic_node][atomic_clean_node] must not be null."); return PARAM_INVALID); InControlAnchorPtr in_ctrl_anchor = atomic_node->GetInControlAnchor(); OutControlAnchorPtr out_ctrl_anchor = atomic_clean_node->GetOutControlAnchor(); @@ -199,18 +201,6 @@ bool AtomicAddrCleanPass::IsAtomicOp(const NodePtr &node) { for (const auto &op_info : op_info_vec) { if (op_info.isAtomic) { GELOGI("Recognized atomic op %s from HCCL engine.", op_desc->GetName().c_str()); - // check peer input is DATA - for (auto &in_data_anchor : node->GetAllInDataAnchors()) { - if (in_data_anchor->GetPeerOutAnchor() != nullptr && - in_data_anchor->GetPeerOutAnchor()->GetOwnerNode() != nullptr) { - auto peer_in_node = in_data_anchor->GetPeerOutAnchor()->GetOwnerNode(); - if (peer_in_node->GetType() == DATA) { - (void)AttrUtils::SetBool(peer_in_node->GetOpDesc(), "_need_memset", true); // no need return - GELOGI("Recognized atomic op %s from HCCL engine and input is DATA.", op_desc->GetName().c_str()); - return false; - } - } - } hcom_node_vec_.push_back(node); return true; } diff --git a/src/ge/graph/passes/base_pass.cc b/src/ge/graph/passes/base_pass.cc index eba17790..2ac7e938 100644 --- a/src/ge/graph/passes/base_pass.cc +++ b/src/ge/graph/passes/base_pass.cc @@ -26,13 +26,15 @@ namespace ge { namespace { -const int kMaxRePassTimes = 1000; -const size_t kMaxOneInNodes = 1000; +constexpr int kMaxRePassTimes = 1000; +constexpr size_t kMaxOneInNodes = 1000; +// Each iteration, we take about 0.3k memory on the stack, we should change the recursion to loop later +constexpr int kMaxRecursiveDepth = 10; void GetAllNodesNoInputEdge(const ComputeGraphPtr &graph, std::queue &input_edge_nodes, std::unordered_set &nodes_seen, std::unordered_set &nodes_last) { nodes_last.clear(); - for (auto &node : graph->GetAllNodes()) { + for (auto &node : graph->GetDirectNode()) { if (node == nullptr) { continue; } @@ -113,6 +115,18 @@ Status RunPasses(NodePtr &node, const NamesToPass &names_to_passes, std::unorder return SUCCESS; } + +void SetFlagOption(NodePassOption option, NamesToPass names_to_pass) { + for (auto &name_to_pass : names_to_pass) { + name_to_pass.second->SetOption(option, ""); + } +} + +void ClearOption(NamesToPass names_to_pass) { + for (auto &name_to_pass : names_to_pass) { + name_to_pass.second->ClearOptions(); + } +} } // namespace Status BaseNodePass::IsolateAndDeleteNode(NodePtr &node, const std::vector &io_map) { @@ -120,8 +134,7 @@ Status BaseNodePass::IsolateAndDeleteNode(NodePtr &node, const std::vector GELOGE(FAILED, "parameter is null."); return FAILED; } - GELOGI("Prepare to isolate and delete node, name:%s, type:%s.", node->GetName().c_str(), - node->GetType().c_str()); + GELOGI("Prepare to isolate and delete node, name:%s, type:%s.", node->GetName().c_str(), node->GetType().c_str()); ComputeGraphPtr graph = node->GetOwnerComputeGraph(); if (graph == nullptr) { GELOGE(FAILED, "[%s] The owner graph must not be null.", node->GetName().c_str()); @@ -154,6 +167,18 @@ Status GEPass::Run(const NamesToPass &names_to_passes) { return INTERNAL_ERROR; } + if (depth_ > kMaxRecursiveDepth) { + GELOGE(PARAM_INVALID, + "The pass for root graph %s will be terminated because too many nesting" + " levels(%d) of subgraphs, last subgraph is %s", + root_graph_->GetName().c_str(), depth_, graph_->GetName().c_str()); + return PARAM_INVALID; + } + + return 
RunPassesOneGraph(names_to_passes); +} + +Status GEPass::RunPassesOneGraph(const NamesToPass &names_to_passes) { GELOGD("Begin to run pass on graph, passes count %zu", names_to_passes.size()); std::queue nodes; std::unordered_set nodes_seen; @@ -186,18 +211,39 @@ Status GEPass::Run(const NamesToPass &names_to_passes) { auto ret = RunPasses(node, names_to_passes, nodes_re_pass, nodes_deleted, nodes_seen); if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, - "Failed to process passes on node %s type %s," - " error code: %u", - node->GetName().c_str(), node->GetType().c_str(), ret); - return INTERNAL_ERROR; + GELOGE(ret, "Failed to process passes on node %s type %s, error code: %u", node->GetName().c_str(), + node->GetType().c_str(), ret); + return ret; + } + + bool has_sub_graph = false; + ret = RunPassesOnSubGraph(node, names_to_passes, has_sub_graph); + if (ret != SUCCESS) { + GELOGE(ret, "Failed to run passes on the sub graph of node %s", node->GetName().c_str()); + return ret; + } + + if (has_sub_graph) { + GELOGD("There are subgraphs on node %s, run passes for the second time", node->GetName().c_str()); + SetFlagOption(kOptimizeAfterSubGraph, names_to_passes); + ret = RunPasses(node, names_to_passes, nodes_re_pass, nodes_deleted, nodes_seen); + if (ret != SUCCESS) { + GELOGE(ret, "Failed to process passes on node %s type %s, error code: %u", node->GetName().c_str(), + node->GetType().c_str(), ret); + return ret; + } + + // There is only one option scene, so set and clear options around the `RunPasses` func. + // if there is more than one scene that sets options, the `ClearOption` function + // should be called at the beginning of each iteration + ClearOption(names_to_passes); } } for (auto &node : nodes_last) { bool all_in_nodes_seen = node->IsAllInNodesSeen(nodes_seen); if (all_in_nodes_seen && nodes_seen.insert(node.get()).second) { - nodes.push(node); + nodes.push(node); } } nodes_last.clear(); @@ -210,4 +256,25 @@ Status GEPass::Run(const NamesToPass &names_to_passes) { return SUCCESS; } +Status GEPass::RunPassesOnSubGraph(const NodePtr &node, const NamesToPass &names_to_passes, bool &has_sub_graph) { + auto sub_graph_names = node->GetOpDesc()->GetSubgraphInstanceNames(); + has_sub_graph = false; + for (const auto &name : sub_graph_names) { + auto graph = root_graph_->GetSubgraph(name); + if (graph == nullptr) { + GELOGW("Cannot find the sub graph %s from node %s, the pass-process will skip it", name.c_str(), + node->GetName().c_str()); + continue; + } + has_sub_graph = true; + GELOGI("Begin to run passes on the sub graph %s of node %s", name.c_str(), node->GetName().c_str()); + GEPass pass(graph, root_graph_, depth_ + 1); + auto ret = pass.Run(names_to_passes); + if (ret != SUCCESS) { + GELOGE(ret, "Failed to run passes for sub graph %s from node %s", name.c_str(), node->GetName().c_str()); + return ret; + } + } + return SUCCESS; +} } // namespace ge diff --git a/src/ge/graph/passes/base_pass.h b/src/ge/graph/passes/base_pass.h index 53eab006..dfba581e 100644 --- a/src/ge/graph/passes/base_pass.h +++ b/src/ge/graph/passes/base_pass.h @@ -29,6 +29,16 @@ #include "graph/utils/op_desc_utils.h" namespace ge { +enum NodePassOption { + // if there is a sub graph on the node, the pass on the node will do: + // Pass(node) -> pass all sub graphs on the node -> Pass(node) + // when passing the node for the second time, kOptimizeAfterSubGraph will be set as a flag key + kOptimizeAfterSubGraph, + + // add new options before kOptionEnd + kOptionEnd +}; +
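// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): GEPass now carries root_graph_
// and depth_ so passes can recurse into node subgraphs while a depth counter
// guards against unbounded nesting. A minimal version of that guard, with
// hypothetical simplified types:
#include <cstdio>
#include <string>
#include <vector>

struct Graph {
  std::string name;
  std::vector<Graph> subgraphs;
};

constexpr int kMaxDepth = 10;  // mirrors kMaxRecursiveDepth above

// Returns false when nesting exceeds the limit instead of overflowing the stack.
bool RunPassRecursive(const Graph &graph, int depth) {
  if (depth > kMaxDepth) {
    std::fprintf(stderr, "graph %s is nested too deeply (%d levels)\n", graph.name.c_str(), depth);
    return false;
  }
  // ... run the node passes on this graph here ...
  for (const Graph &sub : graph.subgraphs) {
    if (!RunPassRecursive(sub, depth + 1)) {  // descend with depth + 1
      return false;
    }
  }
  return true;
}
// ---------------------------------------------------------------------------
class BaseNodePass { public: /// @@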
-45,6 +55,10 @@ class BaseNodePass { std::unordered_set GetNodesDeleted() { return nodes_deleted_; } + void SetOption(NodePassOption option, const std::string &value) { options_[option] = value; } + + void ClearOptions() { options_.clear(); } + void init() { nodes_need_re_pass_.clear(); nodes_deleted_.clear(); @@ -91,21 +105,30 @@ class BaseNodePass { /// void AddNodeDeleted(Node *node) { nodes_deleted_.insert(node); } + bool OptionExists(NodePassOption option) { return options_.count(option) > 0; } + private: std::unordered_set nodes_need_re_pass_; std::unordered_set nodes_deleted_; + std::map options_; }; using NamesToPass = std::vector>; class GEPass { public: - explicit GEPass(ComputeGraphPtr &graph) : graph_(graph) {} + explicit GEPass(ComputeGraphPtr &graph) : graph_(graph), root_graph_(graph), depth_(1) {} virtual ~GEPass() = default; Status Run(const NamesToPass &names_to_passes); private: + GEPass(ComputeGraphPtr &graph, ComputeGraphPtr &root_graph, int depth) + : graph_(graph), root_graph_(root_graph), depth_(depth) {} + Status RunPassesOneGraph(const NamesToPass &names_to_passes); + Status RunPassesOnSubGraph(const NodePtr &node, const NamesToPass &names_to_passes, bool &has_sub_graph); ComputeGraphPtr graph_; + ComputeGraphPtr root_graph_; + int depth_; }; } // namespace ge diff --git a/src/ge/graph/passes/cast_remove_pass.cc b/src/ge/graph/passes/cast_remove_pass.cc new file mode 100644 index 00000000..a0742a03 --- /dev/null +++ b/src/ge/graph/passes/cast_remove_pass.cc @@ -0,0 +1,138 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/passes/cast_remove_pass.h" +#include +#include +#include "framework/common/debug/ge_log.h" +#include "graph/common/transop_util.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/type_utils.h" + +namespace ge { +Status CastRemovePass::Run(NodePtr &node) { + if (node == nullptr) { + GELOGE(PARAM_INVALID, "Param [node] must not be null."); + return PARAM_INVALID; + } + OpDescPtr op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(PARAM_INVALID, "OpDesc of param [node] must not be null."); + return PARAM_INVALID; + } + + // begin with not trans op, and only has one out data node + if (TransOpUtil::IsTransOp(node) || node->GetOutDataNodesSize() != 1) { + return SUCCESS; + } + + std::vector nodes_to_fuse; + NodePtr end_node = GetTheEndNode(node, nodes_to_fuse); + if (nodes_to_fuse.empty()) { + return SUCCESS; + } + OpDescPtr end_op_desc = end_node->GetOpDesc(); + if (end_op_desc == nullptr) { + GELOGE(PARAM_INVALID, "OpDesc of end node must not be null."); + return PARAM_INVALID; + } + + DataType type = DT_UNDEFINED; + if (!HasSameDataType(op_desc, end_op_desc, type)) { + return SUCCESS; + } + if (RemoveCast(type, nodes_to_fuse) != SUCCESS) { + return FAILED; + } + return SUCCESS; +} + +bool CastRemovePass::HasSameDataType(OpDescPtr &begin_op_desc, OpDescPtr &end_op_desc, DataType &type) const { + if (begin_op_desc->GetName() == end_op_desc->GetName()) { + return false; + } + auto end_out_desc = end_op_desc->MutableOutputDesc(0); + DataType end_out_datatype = end_out_desc->GetDataType(); + + auto begin_out_desc = begin_op_desc->MutableOutputDesc(0); + DataType begin_out_datatype = begin_out_desc->GetDataType(); + + if (begin_out_datatype == end_out_datatype && (begin_out_datatype == DT_FLOAT16 || begin_out_datatype == DT_FLOAT)) { + type = begin_out_datatype; + return true; + } + return false; +} + +// op1->TransData->Cast->TransposeD->Cast->TransData->op2 +// change to be +// op1->TransData->TransposeD->TransData->op2 +Status CastRemovePass::RemoveCast(DataType &type, std::vector &nodes_to_fuse) { + string cast_name; + for (NodePtr node : nodes_to_fuse) { + if (node->GetType() == CAST) { + GELOGI("CastRemovePass, remove Cast %s.", node->GetName().c_str()); + cast_name = node->GetName(); + if (IsolateAndDeleteNode(node, {0}) != SUCCESS) { + GELOGE(FAILED, "IsolateAndDeleteNode %s failed.", node->GetName().c_str()); + return FAILED; + } + } + } + + if (cast_name.empty()) { + return SUCCESS; + } + for (auto &node : nodes_to_fuse) { + if (node->GetType() == CAST) { + continue; + } + OpDescPtr op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(FAILED, "OpDesc must not be null."); + return FAILED; + } + + // change node name for recompile cache, will be abandoned in April + string new_node_name = cast_name + op_desc->GetName(); + op_desc->SetName(new_node_name); + // add attr to changed TransData, then will be rebuild + if (!AttrUtils::SetBool(op_desc, ATTR_NEED_COMPILE, true)) { + GELOGE(FAILED, "Set ATTR_NEED_COMPILE Attr fail."); + return FAILED; + } + auto in_desc = op_desc->MutableInputDesc(0); + auto out_desc = op_desc->MutableOutputDesc(0); + in_desc->SetDataType(type); + out_desc->SetDataType(type); + GELOGI("CastRemovePass, change %s %s datatype to be %s.", node->GetType().c_str(), node->GetName().c_str(), + TypeUtils::DataTypeToSerialString(type).c_str()); + } + return SUCCESS; +} + +NodePtr CastRemovePass::GetTheEndNode(NodePtr &begin_node, std::vector &nodes_to_fuse) { + while (begin_node->GetOutDataNodes().size() == 1) 
{ + auto out_node = begin_node->GetOutDataNodes().at(0); + if (!TransOpUtil::IsTransOp(out_node)) { + return begin_node; // when seen not trans op + } + begin_node = out_node; + nodes_to_fuse.emplace_back(begin_node); + } + return begin_node; // when seen branch +} +} // namespace ge diff --git a/src/ge/graph/passes/update_net_output_pass.h b/src/ge/graph/passes/cast_remove_pass.h similarity index 55% rename from src/ge/graph/passes/update_net_output_pass.h rename to src/ge/graph/passes/cast_remove_pass.h index 571d2b9c..53318cff 100644 --- a/src/ge/graph/passes/update_net_output_pass.h +++ b/src/ge/graph/passes/cast_remove_pass.h @@ -14,27 +14,21 @@ * limitations under the License. */ -#ifndef GE_GRAPH_PASSES_UPDATE_NET_OUTPUT_PASS_H_ -#define GE_GRAPH_PASSES_UPDATE_NET_OUTPUT_PASS_H_ +#ifndef GE_GRAPH_PASSES_CAST_REMOVE_PASS_H_ +#define GE_GRAPH_PASSES_CAST_REMOVE_PASS_H_ -#include "graph/types.h" +#include #include "graph/passes/base_pass.h" -#include "framework/common/ge_inner_error_codes.h" -#include "framework/common/debug/ge_log.h" -#include "graph/utils/type_utils.h" -#include "graph/debug/ge_attr_define.h" namespace ge { -class ReUpdateNetOutputPass : public BaseNodePass { +class CastRemovePass : public BaseNodePass { public: - /// - /// Entry of the ReUpdateNetOutputPass optimizer - /// @param [in] node: Input node - /// @return SUCCESS: Execution succeed - /// @return OTHERS: Execution failed - /// @author - /// Status Run(NodePtr &node) override; + + private: + bool HasSameDataType(OpDescPtr &begin_op_desc, OpDescPtr &end_op_desc, DataType &type) const; + Status RemoveCast(DataType &type, std::vector &nodes_to_fuse); + NodePtr GetTheEndNode(NodePtr &begin_node, std::vector &nodes_to_fuse); }; } // namespace ge -#endif // GE_GRAPH_PASSES_UPDATE_NET_OUTPUT_PASS_H_ +#endif // GE_GRAPH_PASSES_CAST_REMOVE_PASS_H_ diff --git a/src/ge/graph/passes/cast_translate_pass.cc b/src/ge/graph/passes/cast_translate_pass.cc index e014db40..2d67b0a8 100644 --- a/src/ge/graph/passes/cast_translate_pass.cc +++ b/src/ge/graph/passes/cast_translate_pass.cc @@ -23,6 +23,7 @@ #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include "graph/common/omg_util.h" +#include "graph/debug/ge_attr_define.h" #include "graph/passes/pass_utils.h" #include "graph/utils/node_utils.h" #include "graph/utils/type_utils.h" @@ -51,15 +52,13 @@ bool CastTranslatePass::CheckInAndOutDataAnchor(NodePtr &node) const { bool CastTranslatePass::IsCastNode(NodePtr &node) const { std::string original_type; - GE_IF_BOOL_EXEC(GetOriginalType(node, original_type) != SUCCESS, - GELOGW("get original type failed"); return false); + GE_IF_BOOL_EXEC(GetOriginalType(node, original_type) != SUCCESS, GELOGW("get original type failed"); return false); return (original_type == CAST); } bool CastTranslatePass::IsTranslateNode(NodePtr &node) const { std::string original_type; - GE_IF_BOOL_EXEC(GetOriginalType(node, original_type) != SUCCESS, - GELOGW("get original type failed"); return false); + GE_IF_BOOL_EXEC(GetOriginalType(node, original_type) != SUCCESS, GELOGW("get original type failed"); return false); return (original_type == TRANSLATE); } @@ -131,13 +130,21 @@ bool CastTranslatePass::IsOpSupportedOptimize(NodePtr &cast_node, NodePtr &trans OpDescPtr trans_op_desc = trans_node->GetOpDesc(); GE_IF_BOOL_EXEC(trans_op_desc == nullptr, GELOGW("trans_op_desc is null."); return false); // backup datatype - DataType trans_in_datatype = trans_op_desc->GetInputDesc(0).GetDataType(); - DataType 
trans_out_datatype = trans_op_desc->GetOutputDesc(0).GetDataType(); + const auto &trans_op_indesc = trans_op_desc->MutableInputDesc(0); + const auto &trans_op_outdesc = trans_op_desc->MutableOutputDesc(0); + GE_CHECK_NOTNULL_EXEC(trans_op_indesc, return false); + GE_CHECK_NOTNULL_EXEC(trans_op_outdesc, return false); + DataType trans_in_datatype = trans_op_indesc->GetDataType(); + DataType trans_out_datatype = trans_op_outdesc->GetDataType(); auto cast_op_desc = cast_node->GetOpDesc(); GE_IF_BOOL_EXEC(cast_op_desc == nullptr, GELOGW("cast_op_desc is null."); return false); - DataType cast_in_datatype = cast_op_desc->GetInputDesc(0).GetDataType(); - DataType cast_out_datatype = cast_op_desc->GetOutputDesc(0).GetDataType(); + const auto &cast_op_indesc = cast_op_desc->MutableInputDesc(0); + const auto &cast_op_outdesc = cast_op_desc->MutableOutputDesc(0); + GE_CHECK_NOTNULL_EXEC(cast_op_indesc, return false); + GE_CHECK_NOTNULL_EXEC(cast_op_outdesc, return false); + DataType cast_in_datatype = cast_op_indesc->GetDataType(); + DataType cast_out_datatype = cast_op_outdesc->GetDataType(); GELOGI("CastTranslatePass, cast in %s out %s, translate in %s out %s.", TypeUtils::DataTypeToSerialString(cast_in_datatype).c_str(), TypeUtils::DataTypeToSerialString(cast_out_datatype).c_str(), @@ -149,13 +156,13 @@ bool CastTranslatePass::IsOpSupportedOptimize(NodePtr &cast_node, NodePtr &trans // change Translate input datatype to be the input of Cast // then delete Cast // [MutableInputDesc guarantees non empty throughout the process] - trans_op_desc->MutableInputDesc(0)->SetDataType(cast_in_datatype); + trans_op_indesc->SetDataType(cast_in_datatype); } else { // Translate-->Cast-->A // change Translate output datatype to be the output of Cast // then delete Cast // [MutableInputDesc guarantees non empty throughout the process] - trans_op_desc->MutableOutputDesc(0)->SetDataType(cast_out_datatype); + trans_op_outdesc->SetDataType(cast_out_datatype); } if (!TranslateCheckAccuracySupported(trans_op_desc)) { @@ -169,16 +176,16 @@ bool CastTranslatePass::IsOpSupportedOptimize(NodePtr &cast_node, NodePtr &trans } if (is_src_cast) { - GE_IF_BOOL_EXEC( - !AttrUtils::SetInt(trans_op_desc, ATTR_NAME_INPUT_DATATYPE, static_cast(cast_in_datatype)), - GELOGW("set ATTR_NAME_INPUT_DATATYPE failed"); return false); + GE_IF_BOOL_EXEC(!AttrUtils::SetInt(trans_op_desc, ATTR_NAME_INPUT_DATATYPE, static_cast(cast_in_datatype)), + GELOGW("set ATTR_NAME_INPUT_DATATYPE failed"); + return false); } else { GE_IF_BOOL_EXEC( - !AttrUtils::SetInt(trans_op_desc, ATTR_NAME_OUTPUT_DATATYPE, static_cast(cast_out_datatype)), - GELOGW("set ATTR_NAME_INPUT_DATATYPE failed"); return false); + !AttrUtils::SetInt(trans_op_desc, ATTR_NAME_OUTPUT_DATATYPE, static_cast(cast_out_datatype)), + GELOGW("set ATTR_NAME_INPUT_DATATYPE failed"); + return false); } - GELOGI("CastTranslatePass, translate in %d out %d.", trans_op_desc->GetInputDesc(0).GetDataType(), - trans_op_desc->GetOutputDesc(0).GetDataType()); + GELOGI("CastTranslatePass, translate in %d out %d.", trans_op_indesc->GetDataType(), trans_op_outdesc->GetDataType()); return true; } diff --git a/src/ge/graph/passes/common_subexpression_elimination_pass.cc b/src/ge/graph/passes/common_subexpression_elimination_pass.cc new file mode 100644 index 00000000..f16be19f --- /dev/null +++ b/src/ge/graph/passes/common_subexpression_elimination_pass.cc @@ -0,0 +1,111 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * 
you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common_subexpression_elimination_pass.h" + +#include +#include +#include + +#include "graph/utils/node_utils.h" +#include "ge_local_engine/engine/host_cpu_engine.h" +#include "graph/passes/folding_pass.h" + +namespace ge { +namespace { +std::string GetCseKey(const NodePtr &node) { + std::stringstream ss; + ss << node->GetType() << "-data-inputs-"; + for (auto &in_anchor : node->GetAllInDataAnchors()) { + auto src_anchor = in_anchor->GetPeerOutAnchor(); + if (src_anchor == nullptr) { + ss << in_anchor->GetIdx() << "-null-"; + } else { + ss << in_anchor->GetIdx() << "-" << src_anchor->GetOwnerNode()->GetName() << "-" << src_anchor->GetIdx() << "-"; + } + } + + ss << "control-inputs-"; + std::set control_in_node_names; + for (auto &src_node : node->GetInControlNodes()) { + control_in_node_names.insert(src_node->GetName()); + } + for (auto &name : control_in_node_names) { + ss << name << "-"; + } + + ss << "attrs-" << AttrUtils::GetAllAttrsStr(node->GetOpDesc()); + + return ss.str(); +} + +/// As the operator category has not been defined, we do not know what types of node can be processed by CSE. /// To avoid deleting wrong nodes (e.g. stateful nodes), +/// only nodes that have a folding kernel will be considered for the CSE process +bool IsNodeSupportCse(const NodePtr &node) { + if (HostCpuEngine::CheckSupported(NodeUtils::GetNodeType(*node))) { + return true; + } + return folding_pass::GetKernelByType(node) != nullptr; +} +} // namespace +Status CommonSubexpressionEliminationPass::Run(ComputeGraphPtr graph) { + GELOGD("Begin to run the CSE process on the graph"); + GE_CHECK_NOTNULL(graph); + std::map keys_to_node; + for (const auto &node : graph->GetAllNodes()) { + if (!IsNodeSupportCse(node)) { + continue; + } + auto key = GetCseKey(node); + auto iter = keys_to_node.find(key); + if (iter == keys_to_node.end()) { + keys_to_node[key] = node; + continue; + } + + if (node->GetAllOutDataAnchorsSize() != iter->second->GetAllOutDataAnchorsSize()) { + GELOGW("The node %s and %s have the same CSE key, but different output anchor count, skip fusing them", + iter->second->GetName().c_str(), node->GetName().c_str()); + continue; + } + + std::vector output_map(node->GetAllOutDataAnchorsSize()); + for (size_t i = 0; i < node->GetAllOutDataAnchorsSize(); ++i) { + output_map[i] = i; + } + + auto ret = GraphUtils::ReplaceNodeAnchors(iter->second, node, {}, output_map); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to replace node %s by node %s, error code %u", node->GetName().c_str(), + iter->second->GetName().c_str(), ret); + return INTERNAL_ERROR; + } + + NodeUtils::UnlinkAll(*node); + + ret = GraphUtils::RemoveNodeWithoutRelink(graph, node); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to remove node %s from graph", node->GetName().c_str()); + return INTERNAL_ERROR; + } + + GELOGI("Remove node %s by the CSE process, replace it with node %s", node->GetName().c_str(), + iter->second->GetName().c_str()); + } + return SUCCESS; +} +} // namespace ge
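// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the CSE pass above recognizes
// equivalent nodes by a string key built from the node type, the ordered data
// inputs, the sorted control inputs and the attributes. The keying and dedup
// idea in miniature, with a hypothetical simplified node type:
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

struct MiniNode {
  std::string type;
  std::vector<std::string> data_inputs;  // ordered: the input index matters
  std::set<std::string> control_inputs;  // unordered: kept sorted for stable keys
};

std::string CseKey(const MiniNode &node) {
  std::stringstream ss;
  ss << node.type << "-data-inputs-";
  for (std::size_t i = 0; i < node.data_inputs.size(); ++i) {
    ss << i << "-" << node.data_inputs[i] << "-";
  }
  ss << "control-inputs-";
  for (const auto &name : node.control_inputs) {
    ss << name << "-";  // std::set iterates in sorted order
  }
  return ss.str();
}

int main() {
  std::vector<MiniNode> nodes = {{"Add", {"a", "b"}, {}}, {"Add", {"a", "b"}, {}}, {"Mul", {"a", "b"}, {}}};
  std::unordered_map<std::string, std::size_t> first_seen;  // key -> first node index
  for (std::size_t i = 0; i < nodes.size(); ++i) {
    auto inserted = first_seen.emplace(CseKey(nodes[i]), i);
    if (!inserted.second) {
      std::cout << "node " << i << " duplicates node " << inserted.first->second << "\n";
    }
  }
  return 0;
}
// ---------------------------------------------------------------------------
diff --git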
a/third_party/fwkacllib/inc/ops/aipp_data.h b/src/ge/graph/passes/common_subexpression_elimination_pass.h similarity index 65% rename from third_party/fwkacllib/inc/ops/aipp_data.h rename to src/ge/graph/passes/common_subexpression_elimination_pass.h index 5e3961ca..b5aecf6b 100644 --- a/third_party/fwkacllib/inc/ops/aipp_data.h +++ b/src/ge/graph/passes/common_subexpression_elimination_pass.h @@ -14,17 +14,16 @@ * limitations under the License. */ -#ifndef GE_OP_AIPP_DATA_H -#define GE_OP_AIPP_DATA_H +#ifndef GE_COMMON_SUBEXPRESSION_ELIMINATION_H_ +#define GE_COMMON_SUBEXPRESSION_ELIMINATION_H_ -#include "../graph/operator_reg.h" +#include "graph/types.h" +#include "inc/graph_pass.h" namespace ge { -REG_OP(AippData) - .INPUT(data, TensorType::ALL()) - .OUTPUT(out, TensorType::ALL()) - .ATTR(index, Int, 0) - .OP_END_FACTORY_REG(AippData) -} // namespace ge - -#endif // GE_OP_AIPP_DATA_H +class CommonSubexpressionEliminationPass : public GraphPass { + public: + Status Run(ge::ComputeGraphPtr graph) override; +}; +} // namespace ge +#endif // GE_COMMON_SUBEXPRESSION_ELIMINATION_H_ diff --git a/src/ge/graph/passes/compile_nodes_pass.cc b/src/ge/graph/passes/compile_nodes_pass.cc index 8f7438ea..f46b11f0 100644 --- a/src/ge/graph/passes/compile_nodes_pass.cc +++ b/src/ge/graph/passes/compile_nodes_pass.cc @@ -34,6 +34,7 @@ const char *const kAICPUKernelLibName = "aicpu_kernel"; namespace ge { graphStatus CompileNodesPass::Run(ComputeGraphPtr graph) { + GE_TIMESTAMP_START(CompileNodesPass); GELOGI("[CompileNodesPass]: optimize begin."); if (graph == nullptr) { return GRAPH_SUCCESS; @@ -77,6 +78,7 @@ graphStatus CompileNodesPass::Run(ComputeGraphPtr graph) { return result; } GELOGI("[CompileNodesPass]: Optimize success."); + GE_TIMESTAMP_END(CompileNodesPass, "GraphManager::CompileNodesPass"); return GRAPH_SUCCESS; } diff --git a/src/ge/graph/passes/compile_nodes_pass.h b/src/ge/graph/passes/compile_nodes_pass.h index 30d1d974..70f8cbf5 100644 --- a/src/ge/graph/passes/compile_nodes_pass.h +++ b/src/ge/graph/passes/compile_nodes_pass.h @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include "inc/graph_pass.h" #include "init/gelib.h" diff --git a/src/ge/graph/passes/constant_folding_pass.cc b/src/ge/graph/passes/constant_folding_pass.cc index dc4f4b90..3ac7feb6 100644 --- a/src/ge/graph/passes/constant_folding_pass.cc +++ b/src/ge/graph/passes/constant_folding_pass.cc @@ -21,6 +21,7 @@ #include "common/debug/log.h" #include "common/types.h" #include "framework/common/debug/ge_log.h" +#include "graph/operator_factory.h" #include "graph/utils/attr_utils.h" #include "graph/utils/node_utils.h" #include "graph/utils/op_desc_utils.h" @@ -32,37 +33,45 @@ Status ConstantFoldingPass::Run(ge::NodePtr &node) { GE_CHECK_NOTNULL(node); GELOGD("Begin to run constant folding on node %s", node->GetName().c_str()); - OpDescPtr node_desc = node->GetOpDesc(); - if (node_desc == nullptr) { + if (folding_pass::IsNoNeedConstantFolding(node)) { return SUCCESS; } + OpDescPtr node_desc = node->GetOpDesc(); DataType data_type = node_desc->GetOutputDesc(0).GetDataType(); Format format = node_desc->GetOutputDesc(0).GetFormat(); - GELOGD("current [node:%s, type:%s] info: format: %s, datatype:%s", node->GetName().c_str(), node->GetType().c_str(), + GELOGD("Current [node:%s, type:%s] info: format: %s, datatype:%s", node->GetName().c_str(), node->GetType().c_str(), TypeUtils::FormatToSerialString(format).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str()); auto input_nodes = 
OpDescUtils::GetConstInputNode(*node); if (input_nodes.empty() || input_nodes.size() != node_desc->GetInputsSize()) { - GELOGI("Const input nodes size is %zu, and nodeDesc inputsSize is %zu.", - input_nodes.size(), node_desc->GetInputsSize()); + GELOGD("Node:%s, const input nodes size is %zu, and nodeDesc inputsSize is %zu.", node->GetName().c_str(), + input_nodes.size(), node_desc->GetInputsSize()); return SUCCESS; } - auto op_kernel = folding_pass::GetKernelByType(node); - if (op_kernel == nullptr) { - GELOGD("No op kernel for node %s type %s, skip the constant folding", node->GetName().c_str(), - node->GetType().c_str()); - return SUCCESS; - } auto inputs = OpDescUtils::GetInputData(input_nodes); vector outputs; - auto ret = op_kernel->Compute(node_desc, inputs, outputs); + auto ret = RunOpKernel(node, inputs, outputs); if (ret != SUCCESS) { - if (ret == NOT_CHANGED) { + auto op_kernel = folding_pass::GetKernelByType(node); + if (op_kernel == nullptr) { + GELOGD("No op kernel for node %s type %s, skip the constant folding", node->GetName().c_str(), + node->GetType().c_str()); return SUCCESS; } - GELOGE(INTERNAL_ERROR, "Calculate for node %s failed in constant folding", node->GetName().c_str()); - return ret; + + ret = op_kernel->Compute(node_desc, inputs, outputs); + if (ret != SUCCESS) { + if (ret == NOT_CHANGED) { + GELOGD("Node %s type %s, compute terminates and exits the constant folding.", node->GetName().c_str(), + node->GetType().c_str()); + return SUCCESS; + } + GELOGE(INTERNAL_ERROR, "Calculate for node %s failed in constant folding", node->GetName().c_str()); + return ret; + } + GELOGI("Node %s type %s, constant folding compute success.", node->GetName().c_str(), node->GetType().c_str()); } + if (outputs.empty()) { GELOGE(INTERNAL_ERROR, "Failed to constant folding on node %s," diff --git a/src/ge/graph/passes/dimension_compute_pass.cc b/src/ge/graph/passes/dimension_compute_pass.cc index adf60737..a429e69d 100644 --- a/src/ge/graph/passes/dimension_compute_pass.cc +++ b/src/ge/graph/passes/dimension_compute_pass.cc @@ -29,7 +29,7 @@ namespace ge { Status DimensionComputePass::Run(ge::NodePtr &node) { GE_CHECK_NOTNULL(node); auto op_kernel = folding_pass::GetKernelByType(node); - if (op_kernel == nullptr) { + if (op_kernel == nullptr || folding_pass::IsNoNeedConstantFolding(node)) { return SUCCESS; } std::vector outputs; diff --git a/src/ge/graph/passes/flow_ctrl_pass.cc b/src/ge/graph/passes/flow_ctrl_pass.cc index 0b72c806..d144351d 100644 --- a/src/ge/graph/passes/flow_ctrl_pass.cc +++ b/src/ge/graph/passes/flow_ctrl_pass.cc @@ -29,7 +29,6 @@ namespace ge { // when namespace change to ge, please delete the using code. - Status FlowCtrlPass::Run(ComputeGraphPtr compute_graph) { GE_CHECK_NOTNULL(compute_graph); @@ -115,7 +114,7 @@ NodePtr FlowCtrlPass::InsertOp(ComputeGraphPtr &compute_graph, const string &nod } } - GE_IF_BOOL_EXEC(compute_graph == nullptr, GE_LOGE("compute_graph is nullptr"); return nullptr); + GE_IF_BOOL_EXEC(compute_graph == nullptr, DOMI_LOGE("compute_graph is nullptr"); return nullptr); NodePtr node = compute_graph->AddNode(op_desc); if (node == nullptr) { GELOGE(FAILED, "add node failed, name:%s, type:%s.", node_name.c_str(), node_type.c_str()); @@ -159,14 +158,14 @@ NodePtr FlowCtrlPass::InsertStreamSwitchOp(ComputeGraphPtr &compute_graph, const // stream switch op need switch cond by attr. 
GE_IF_BOOL_EXEC( !AttrUtils::SetInt(stream_switch->GetOpDesc(), ATTR_NAME_STREAM_SWITCH_COND, static_cast(RT_LESS)), - GE_LOGE("set ATTR_NAME_STREAM_SWITCH_COND failed"); + DOMI_LOGE("set ATTR_NAME_STREAM_SWITCH_COND failed"); return nullptr); return stream_switch; } NodePtr FlowCtrlPass::AddVariableNode(ComputeGraphPtr &compute_graph, const string &name) { - GE_IF_BOOL_EXEC(compute_graph == nullptr, GE_LOGE("compute_graph is nullptr"); return nullptr); + GE_IF_BOOL_EXEC(compute_graph == nullptr, DOMI_LOGE("compute_graph is nullptr"); return nullptr); NodePtr exist_node = compute_graph->FindNode(name); if (exist_node != nullptr) { GELOGD("Node %s already exist, no need add.", name.c_str()); @@ -194,6 +193,16 @@ Status FlowCtrlPass::AddGlobalStepVariableNode(ComputeGraphPtr &compute_graph) { GELOGD("Node %s can't be found in graph %u", NODE_NAME_NET_OUTPUT.c_str(), compute_graph->GetGraphID()); return SUCCESS; } + + if (compute_graph->GetParentGraph() != nullptr) { // Global step just add to main graph. + GELOGD("Graph %s no need global step variable.", compute_graph->GetName().c_str()); + uint32_t parent_index = 0; // Set to 0 as a mark for subgraph. + if (!AttrUtils::SetInt(output_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + GELOGW("Node: %s Add attr %s failed.", output_node->GetName().c_str(), ATTR_NAME_PARENT_NODE_INDEX.c_str()); + } + return SUCCESS; + } + NodePtr exist_node = compute_graph->FindNode(NODE_NAME_GLOBAL_STEP); if (exist_node != nullptr) { GELOGD("Node %s already exist, no need add.", NODE_NAME_GLOBAL_STEP.c_str()); @@ -284,7 +293,7 @@ Status FlowCtrlPass::CreateIterCtrlTrueBranch(ComputeGraphPtr &compute_graph, co } GE_CHK_STATUS_RET(SetStreamLabel(active_node, active_name), "set stream label failed"); GE_IF_BOOL_EXEC(!AttrUtils::SetBool(active_node->GetOpDesc(), ATTR_NAME_IS_LOOP_ACTIVE, true), - GE_LOGE("set ATTR_NAME_IS_LOOP_ACTIVE failed"); + DOMI_LOGE("set ATTR_NAME_IS_LOOP_ACTIVE failed"); return FAILED); // add ctrl edges @@ -337,7 +346,7 @@ Status FlowCtrlPass::CreateIterCtrlFalseBranch(ComputeGraphPtr &compute_graph, c } Status FlowCtrlPass::AddFpBpIteratorCtrl(ComputeGraphPtr &compute_graph, NodePtr &pre_node) { - GE_IF_BOOL_EXEC(pre_node == nullptr, GE_LOGE("pre_node is nullptr"); return FAILED); + GE_IF_BOOL_EXEC(pre_node == nullptr, DOMI_LOGE("pre_node is nullptr"); return FAILED); string pre_node_name = pre_node->GetName(); GELOGI("Add FpBp Iterator ctrl, pre node:%s.", pre_node_name.c_str()); // 1. Get or add variables @@ -413,7 +422,7 @@ Status FlowCtrlPass::AddSpecialNodeIteratorCtrl(ComputeGraphPtr &compute_graph, * itersPerLoop loopCond */ GE_IF_BOOL_EXEC(loop_after_node == nullptr || compute_graph == nullptr, - GE_LOGE("loop after node or compute graph is null"); + DOMI_LOGE("loop after node or compute graph is null"); return FAILED); InDataAnchorPtr in_anchor = loop_after_node->GetInDataAnchor(0); if (in_anchor == nullptr || in_anchor->GetPeerOutAnchor() == nullptr) { @@ -435,7 +444,7 @@ Status FlowCtrlPass::AddSpecialNodeIteratorCtrl(ComputeGraphPtr &compute_graph, } // 2. Add StreamSwitch and edges to switch_node. 
- GE_IF_BOOL_EXEC(loop_pre_node == nullptr, GE_LOGE("loop pre node is null"); return FAILED); + GE_IF_BOOL_EXEC(loop_pre_node == nullptr, DOMI_LOGE("loop pre node is null"); return FAILED); string switch_name = loop_pre_node->GetName() + "_" + NODE_NAME_STREAM_SWITCH; NodePtr switch_node = InsertStreamSwitchOp(compute_graph, switch_name, loop_cond_node, iter_per_loop_node); if (switch_node == nullptr) { @@ -469,7 +478,7 @@ Status FlowCtrlPass::AddSpecialNodeIteratorCtrl(ComputeGraphPtr &compute_graph, GE_CHK_STATUS_RET(SetStreamLabel(active_node, active_name), "set stream label failed"); GE_IF_BOOL_EXEC(!AttrUtils::SetBool(active_node->GetOpDesc(), ATTR_NAME_IS_LOOP_ACTIVE, true), - GE_LOGE("set ATTR_NAME_IS_LOOP_ACTIVE failed"); + DOMI_LOGE("set ATTR_NAME_IS_LOOP_ACTIVE failed"); return FAILED); add_ret = GraphUtils::AddEdge(switch_node->GetOutControlAnchor(), active_node->GetInControlAnchor()); diff --git a/src/ge/graph/passes/folding_kernel/add_kernel.cc b/src/ge/graph/passes/folding_kernel/add_kernel.cc index 7fa79373..89f99938 100644 --- a/src/ge/graph/passes/folding_kernel/add_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/add_kernel.cc @@ -18,6 +18,7 @@ #include +#include "common/math/math_util.h" #include "graph/common/bcast.h" #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" @@ -34,45 +35,49 @@ const size_t kAddOutputSize = 1; case (DTYPE): \ ret = BCastAdd(op_desc_ptr, input, v_output); \ break; - -#define SET_OVERFLOW_CHECK_SIGNED_CASE(DTYPE, MAX_VALUE, MIN_VALUE) \ - case (DTYPE): \ - if (((y > 0) && (x > ((MAX_VALUE)-y))) || ((y < 0) && (x < ((MIN_VALUE)-y)))) { \ - overflow_flag = true; \ - } \ - break; - -#define SET_OVERFLOW_CHECK_UNSIGNED_CASE(DTYPE, TYPE, MAX_VALUE) \ - case (DTYPE): { \ - TYPE threshold = static_cast(static_cast(MAX_VALUE) - y); \ - if (static_cast(x) > threshold) { \ - overflow_flag = true; \ - } \ - break; \ - } - } // namespace template -bool AddKernel::OverflowCheck(const T &x, const T &y, DataType data_type) { - bool overflow_flag = false; - +Status AddKernel::OverflowCheck(const T &x, const T &y, DataType data_type) { switch (data_type) { - SET_OVERFLOW_CHECK_SIGNED_CASE(DT_INT8, INT8_MAX, INT8_MIN) - SET_OVERFLOW_CHECK_SIGNED_CASE(DT_INT16, INT16_MAX, INT16_MIN) - SET_OVERFLOW_CHECK_SIGNED_CASE(DT_INT32, INT32_MAX, INT32_MIN) - SET_OVERFLOW_CHECK_SIGNED_CASE(DT_INT64, INT64_MAX, INT64_MIN) - SET_OVERFLOW_CHECK_SIGNED_CASE(DT_FLOAT, FLT_MAX, FLT_MIN) - SET_OVERFLOW_CHECK_SIGNED_CASE(DT_DOUBLE, DBL_MAX, DBL_MIN) - SET_OVERFLOW_CHECK_UNSIGNED_CASE(DT_UINT8, uint8_t, UINT8_MAX) - SET_OVERFLOW_CHECK_UNSIGNED_CASE(DT_UINT16, uint16_t, UINT16_MAX) - SET_OVERFLOW_CHECK_UNSIGNED_CASE(DT_UINT32, uint32_t, UINT32_MAX) - SET_OVERFLOW_CHECK_UNSIGNED_CASE(DT_UINT64, uint64_t, UINT64_MAX) + case DT_INT8: + FMK_INT8_ADDCHECK(x, y) + break; + case DT_INT16: + FMK_INT16_ADDCHECK(x, y) + break; + case DT_INT32: + FMK_INT32_ADDCHECK(x, y) + break; + case DT_INT64: + FMK_INT64_ADDCHECK(x, y) + break; + case DT_UINT8: + FMK_UINT8_ADDCHECK(x, y) + break; + case DT_UINT16: + FMK_UINT16_ADDCHECK(x, y) + break; + case DT_UINT32: + FMK_UINT32_ADDCHECK(x, y) + break; + case DT_UINT64: + FMK_UINT64_ADDCHECK(x, y) + break; + case DT_FLOAT16: + FMK_FP16_ADDCHECK(x, y) + break; + case DT_FLOAT: + FMK_FLOAT_ADDCHECK(x, y) + break; + case DT_DOUBLE: + FMK_DOUBLE_ADDCHECK(x, y) + break; default: break; } - return overflow_flag; + return SUCCESS; } template @@ -80,8 +85,8 @@ Status AddKernel::BCastAdd(const OpDescPtr &op_desc_ptr, const std::vector &v_output) 
{ // only broadcast shape BCast bcast; - Status ret = bcast.GenerateBcastInfo(ge::BCast::TransShapeToDimVec(input[kAddFirstInput]->GetTensorDesc()), - ge::BCast::TransShapeToDimVec(input[kAddSecondInput]->GetTensorDesc())); + Status ret = bcast.GenerateBcastInfo(BCast::TransShapeToDimVec(input[kAddFirstInput]->GetTensorDesc()), + BCast::TransShapeToDimVec(input[kAddSecondInput]->GetTensorDesc())); if (ret != SUCCESS) { GELOGE(ret, "Greater broadcasting failed."); return ret; @@ -91,46 +96,33 @@ Status AddKernel::BCastAdd(const OpDescPtr &op_desc_ptr, const std::vector y_indexes; bcast.BCastIndexes(x_indexes, y_indexes); - if (input[kAddFirstInput]->GetData().size() < sizeof(InT)) { - GELOGE(FAILED, "The size of the first input is less than the size of the InT."); - return FAILED; - } auto x1_data = reinterpret_cast(input[kAddFirstInput]->GetData().data()); - - if (input[kAddSecondInput]->GetData().size() < sizeof(InT)) { - GELOGE(FAILED, "The size of the second input is less than the size of the InT."); - return FAILED; - } auto x2_data = reinterpret_cast(input[kAddSecondInput]->GetData().data()); size_t data_num = x_indexes.size(); - InT *data = nullptr; - data = new (std::nothrow) InT[data_num](); - GE_CHECK_NOTNULL(data); + std::unique_ptr buf(new (std::nothrow) InT[data_num]()); + if (buf == nullptr) { + GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed", static_cast(sizeof(InT) * data_num)); + return MEMALLOC_FAILED; + } DataType data_type = input[kAddFirstInput]->GetTensorDesc().GetDataType(); for (size_t i = 0; i < data_num; i++) { auto x_index = *(x1_data + x_indexes[i]); auto y_index = *(x2_data + y_indexes[i]); - if (OverflowCheck(x_index, y_index, data_type)) { + if (OverflowCheck(x_index, y_index, data_type) != SUCCESS) { GELOGE(PARAM_INVALID, "Result of add is overflow."); - GE_DELETE_NEW_ARRAY(data); return PARAM_INVALID; } - data[i] = x_index + y_index; + *(buf.get() + i) = x_index + y_index; } GeTensorPtr output_ptr = MakeShared(op_desc_ptr->GetOutputDesc(kAddFirstOutput)); if (output_ptr == nullptr) { GELOGE(MEMALLOC_FAILED, "Make shared failed"); - GE_DELETE_NEW_ARRAY(data); return MEMALLOC_FAILED; } - if (output_ptr->SetData(reinterpret_cast(data), data_num * sizeof(InT))) { - GELOGW("GetRange: SetData failed"); - } - GE_DELETE_NEW_ARRAY(data); - + output_ptr->SetData(reinterpret_cast(buf.get()), data_num * sizeof(InT)); output_ptr->MutableTensorDesc().SetDataType(data_type); vector bcast_dims = bcast.GetOutputShape(); output_ptr->MutableTensorDesc().SetShape(GeShape(bcast_dims)); @@ -139,25 +131,22 @@ Status AddKernel::BCastAdd(const OpDescPtr &op_desc_ptr, const std::vector &input, - std::vector &v_output) { +Status AddKernel::AddCheck(const OpDescPtr &op_desc_ptr, const std::vector &input) { if (op_desc_ptr == nullptr) { GELOGE(PARAM_INVALID, "Op_desc_ptr must not be null."); - return NOT_CHANGED; + return PARAM_INVALID; } // check how many inputs if ((input.size() != kAddInputSize) || (op_desc_ptr->GetOutputsSize() != kAddOutputSize)) { GELOGE(PARAM_INVALID, "The number of input for add must be %zu, output number must be %zu.", kAddInputSize, kAddOutputSize); - return NOT_CHANGED; + return PARAM_INVALID; } - // input vector elements must not be null if ((input[kAddFirstInput] == nullptr) || (input[kAddSecondInput] == nullptr)) { GELOGE(PARAM_INVALID, "Input vector elements must not be null."); - return NOT_CHANGED; + return PARAM_INVALID; } - // Inputs must have the same datatype. 
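// Editor's sketch: the FMK_*_ADDCHECK macros selected in OverflowCheck above are assumed to log and return a failure status when x + y would overflow. For signed integer types the underlying predicate is equivalent to this standalone helper (editor's illustration, not the GE implementation):
#include <limits>
template <typename T>
bool AddWouldOverflow(T x, T y) {
  if (y > 0) return x > std::numeric_limits<T>::max() - y;     // x + y > max
  if (y < 0) return x < std::numeric_limits<T>::lowest() - y;  // x + y < min
  return false;  // adding zero never overflows
}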
DataType data_type_0 = input[kAddFirstInput]->GetTensorDesc().GetDataType(); DataType data_type_1 = input[kAddSecondInput]->GetTensorDesc().GetDataType(); @@ -165,18 +154,27 @@ Status AddKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetData().size() == 0) || (input[kAddSecondInput]->GetData().size() == 0)) { GELOGW("Data size of input0 is %zu, input1 is %zu.", input[kAddFirstInput]->GetData().size(), input[kAddSecondInput]->GetData().size()); + return PARAM_INVALID; + } + + return SUCCESS; +} + +Status AddKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector &input, + std::vector &v_output) { + if (AddCheck(op_desc_ptr, input) != SUCCESS) { return NOT_CHANGED; } Status ret = NOT_CHANGED; - switch (data_type_0) { + DataType data_type = input[kAddFirstInput]->GetTensorDesc().GetDataType(); + switch (data_type) { SET_BCAST_ADD_CASE(DT_INT8, int8_t) SET_BCAST_ADD_CASE(DT_INT16, int16_t) SET_BCAST_ADD_CASE(DT_INT32, int32_t) @@ -185,10 +183,11 @@ Status AddKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector &input, + std::vector &v_output) override; + + private: + Status AddCheck(const OpDescPtr &op_desc_ptr, const std::vector &input); + template - bool OverflowCheck(const T &x, const T &y, DataType data_type); + Status OverflowCheck(const T &x, const T &y, DataType data_type); template Status BCastAdd(const OpDescPtr &op_desc_ptr, const std::vector &input, std::vector &v_output); - Status Compute(const ge::OpDescPtr op_desc_ptr, const std::vector &input, - std::vector &v_output) override; }; } // namespace ge #endif // GE_GRAPH_PASSES_FOLDING_KERNEL_ADD_KERNEL_H_ diff --git a/src/ge/graph/passes/folding_kernel/broadcast_args_kernel.cc b/src/ge/graph/passes/folding_kernel/broadcast_args_kernel.cc index b513f737..364fb415 100644 --- a/src/ge/graph/passes/folding_kernel/broadcast_args_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/broadcast_args_kernel.cc @@ -41,18 +41,22 @@ Status BroadcastArgsKernel::Compute(const OpDescPtr op_desc_ptr, const std::vect } // check input size bool size_check = - (op_desc_ptr->GetAllInputsDesc().size() != kBCastArgsInputsSize || input.size() != kBCastArgsInputsSize || - op_desc_ptr->GetAllOutputsDesc().size() != kBCastArgsOutputsSize); + (op_desc_ptr->GetAllInputsDesc().size() != kBCastArgsInputsSize || input.size() != kBCastArgsInputsSize || + op_desc_ptr->GetAllOutputsDesc().size() != kBCastArgsOutputsSize); if (size_check) { - GELOGW("input/output size error. InDesc size:%zu," - "OutDesc size:%zu, in size:%zu ", - op_desc_ptr->GetAllInputsDesc().size(), op_desc_ptr->GetAllOutputsDesc().size(), input.size()); + GELOGW( + "input/output size error. 
InDesc size:%zu," + "OutDesc size:%zu, in size:%zu ", + op_desc_ptr->GetAllInputsDesc().size(), op_desc_ptr->GetAllOutputsDesc().size(), input.size()); return NOT_CHANGED; } vector x1_dims; vector x2_dims; - DataType data_type = op_desc_ptr->GetInputDesc(0).GetDataType(); + const auto &op_in_desc = op_desc_ptr->MutableInputDesc(0); + GE_CHECK_NOTNULL(op_in_desc); + ; + DataType data_type = op_in_desc->GetDataType(); bool result = (OpUtils::GetShapeDataFromConstTensor(input[0], data_type, x1_dims) == SUCCESS) && (OpUtils::GetShapeDataFromConstTensor(input[1], data_type, x2_dims) == SUCCESS); if (!result) { diff --git a/src/ge/graph/passes/folding_kernel/cast_kernel.cc b/src/ge/graph/passes/folding_kernel/cast_kernel.cc index 936f6b5c..bcd26f70 100644 --- a/src/ge/graph/passes/folding_kernel/cast_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/cast_kernel.cc @@ -57,22 +57,21 @@ Status CastKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetOutputDesc(0); GeTensorDesc op_desc_in = op_desc_ptr->GetInputDesc(0); auto src_data_type = op_desc_in.GetDataType(); - auto src_shape = op_desc_in.GetShape(); - auto src_format = op_desc_in.GetFormat(); - auto data_type = op_desc.GetDataType(); - auto data_shape = op_desc.GetShape(); + auto src_shape = op_desc_in.GetShape(); + auto src_format = op_desc_in.GetFormat(); + auto data_type = op_desc.GetDataType(); + auto data_shape = op_desc.GetShape(); auto data_format = op_desc.GetFormat(); - GELOGD("current node %s, format %s, input shape %s, data type %s, weight format %s, shape %s, data type %s. " - "output format %s, shape %s, data type %s", op_desc_ptr->GetName().c_str(), - TypeUtils::FormatToSerialString(src_format).c_str(), - formats::ShapeToString(src_shape).c_str(), - TypeUtils::DataTypeToSerialString(src_data_type).c_str(), - TypeUtils::FormatToSerialString(const_weight_ptr->GetTensorDesc().GetFormat()).c_str(), - formats::ShapeToString(const_weight_ptr->GetTensorDesc().GetShape()).c_str(), - TypeUtils::DataTypeToSerialString(const_weight_ptr->GetTensorDesc().GetDataType()).c_str(), - TypeUtils::FormatToSerialString(data_format).c_str(), - formats::ShapeToString(data_shape).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str()); + GELOGD( + "Current node %s, format %s, input shape %s, data type %s, weight format %s, shape %s, data type %s. 
" + "output format %s, shape %s, data type %s", + op_desc_ptr->GetName().c_str(), TypeUtils::FormatToSerialString(src_format).c_str(), + formats::ShapeToString(src_shape).c_str(), TypeUtils::DataTypeToSerialString(src_data_type).c_str(), + TypeUtils::FormatToSerialString(const_weight_ptr->GetTensorDesc().GetFormat()).c_str(), + formats::ShapeToString(const_weight_ptr->GetTensorDesc().GetShape()).c_str(), + TypeUtils::DataTypeToSerialString(const_weight_ptr->GetTensorDesc().GetDataType()).c_str(), + TypeUtils::FormatToSerialString(data_format).c_str(), formats::ShapeToString(data_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); GE_CHECK_SIZE(const_weight_ptr->GetData().GetSize()); auto src_data_size = src_shape.GetShapeSize(); @@ -84,17 +83,16 @@ Status CastKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector(src_data_size), src_data_type, data_type}; formats::TransResult trans_result; GELOGD("Trans data type from %s to %s, shape %s, data size %ld", - TypeUtils::DataTypeToSerialString(src_data_type).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str(), + TypeUtils::DataTypeToSerialString(src_data_type).c_str(), TypeUtils::DataTypeToSerialString(data_type).c_str(), formats::ShapeToString(src_shape).c_str(), src_data_size); - if (src_format != data_format || src_shape.GetDims() != data_shape.GetDims() || - !formats::IsTransDataTypeSupport(cast_args)) { + if ((src_format != data_format) || (src_shape.GetDims() != data_shape.GetDims()) || + (!formats::IsTransDataTypeSupport(cast_args))) { GELOGW("Transfer from data type %s to %s, format %s to %s, shape %s to %s is not supported", TypeUtils::DataTypeToSerialString(src_data_type).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str(), - TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(data_format).c_str(), - formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(data_shape).c_str()); + TypeUtils::DataTypeToSerialString(data_type).c_str(), TypeUtils::FormatToSerialString(src_format).c_str(), + TypeUtils::FormatToSerialString(data_format).c_str(), formats::ShapeToString(src_shape).c_str(), + formats::ShapeToString(data_shape).c_str()); return NOT_CHANGED; } if (!KernelUtils::CheckSizeForTransOp(const_weight_ptr, op_desc_ptr)) { @@ -104,8 +102,8 @@ Status CastKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector(N); i++) { buf[concat_dim] = offset; - // generate output - GeTensorPtr output_ptr = MakeShared(); + // generate output, index 0 can always gets a GeTensorDesc object from any OpDescPtr. 
+ auto output_tensor_desc = op_desc_ptr->GetOutputDesc(0); + GeTensorPtr output_ptr = MakeShared(output_tensor_desc); if (output_ptr == nullptr) { GELOGE(MEMALLOC_FAILED, "Failed to fold node %s, out of memory", op_desc_ptr->GetName().c_str()); return NOT_CHANGED; @@ -86,7 +87,9 @@ Status ConcatOffsetKernel::Compute(const OpDescPtr op_desc_ptr, const vectorGetTensorDesc().GetShape().GetDim(concat_dim); + const int32_t *input_shape = + reinterpret_cast(input[i + kConcatOffsetInputIndexOne]->GetData().data()); + int64_t input_dim = input_shape[concat_dim]; // this index is valid, checked before if (input_dim > (INT32_MAX - offset)) { GELOGE(PARAM_INVALID, " %d and %ld addition can result in overflow!.", offset, input_dim); return INTERNAL_ERROR; } diff --git a/src/ge/graph/passes/folding_kernel/concat_v2_kernel.cc b/src/ge/graph/passes/folding_kernel/concat_v2_kernel.cc index 0879ff86..9d06b612 100644 --- a/src/ge/graph/passes/folding_kernel/concat_v2_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/concat_v2_kernel.cc @@ -34,13 +34,14 @@ const size_t kConcatV2InputNum = 3; const std::set concatv2_supported_type = {DT_INT32, DT_FLOAT}; template -void GetOutputData(std::vector &y_data, int loop, size_t &input_size, const std::vector &input) { - for (int i = 0; i < loop; i++) { +void GetOutputData(std::vector &y_data, int64_t loop, size_t &input_size, + const std::vector &input) { + for (int64_t i = 0; i < loop; i++) { for (size_t k = 0; k < input_size; k++) { GeShape datak_shape = input.at(k)->GetTensorDesc().GetShape(); const T *datak = reinterpret_cast(input.at(k)->GetData().data()); - int gapk = datak_shape.GetShapeSize() / loop; // [2,3] is 6/loop - for (int j = 0; j < gapk; j++) { + int64_t gapk = datak_shape.GetShapeSize() / loop; // [2,3] is 6/loop + for (int64_t j = 0; j < gapk; j++) { y_data.push_back(datak[j + gapk * i]); } } @@ -82,14 +83,16 @@ Status ConcatV2Kernel::Compute(const ge::OpDescPtr op_desc_ptr, const vector y_data_int32_t; std::vector y_data_float; - GeTensorPtr output_ptr = MakeShared(); + // Index 0 can always get a GeTensorDesc object from any OpDescPtr.
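// Editor's worked example for GetOutputData above (hypothetical shapes): concatenating two int32 tensors of shape [2,2] along dim 1 gives loop = 2 (the product of the dims before the axis) and gapk = 4 / 2 = 2 per input, so rows are copied in input order:
//   input0 = {1,2,3,4}, input1 = {5,6,7,8}  ->  output = {1,2, 5,6, 3,4, 7,8}
// The widening to int64_t matters because loop is a product of leading dims and can exceed INT32_MAX for large shapes.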
+ auto output_tensor_desc = op_desc_ptr->GetOutputDesc(0); + GeTensorPtr output_ptr = MakeShared(output_tensor_desc); if (output_ptr == nullptr) { GELOGE(MEMALLOC_FAILED, "MakeShared failed."); return MEMALLOC_FAILED; } GeShape data0_shape = tensor0->GetTensorDesc().GetShape(); - int loop = 1; + int64_t loop = 1; for (int i = 0; i < tidx; i++) { loop *= data0_shape.GetDim(i); } diff --git a/src/ge/graph/passes/folding_kernel/dynamic_stitch_kernel.cc b/src/ge/graph/passes/folding_kernel/dynamic_stitch_kernel.cc index 479711c8..94576ed1 100644 --- a/src/ge/graph/passes/folding_kernel/dynamic_stitch_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/dynamic_stitch_kernel.cc @@ -16,180 +16,199 @@ #include "graph/passes/folding_kernel/dynamic_stitch_kernel.h" +#include #include #include "common/fp16_t.h" #include "common/ge_inner_error_codes.h" +#include "common/math/math_util.h" #include "common/op/ge_op_utils.h" #include "common/types.h" #include "framework/common/debug/ge_log.h" -#include "graph/passes/folding_kernel/kernel_utils.h" #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" namespace ge { namespace { const int kDoubleAttrN = 2; +const int kFirstOutputDescIdx = 0; +const int kMergedShapeSecondDim = 1; +const std::set kSupportedTypeSet = {DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, + DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}; } // namespace Status DynamicStitchKernel::Compute(const OpDescPtr op_desc_ptr, const vector &input, vector &v_output) { - GELOGI("DynamicStitch Kernel in."); + GELOGD("DynamicStitch Kernel in."); Status validate_ret = ValidateParams(op_desc_ptr, input); if (validate_ret != SUCCESS) { GELOGW("Dynamic stitch kernel params validate failed."); - return validate_ret; + return NOT_CHANGED; } - GE_CHECK_NOTNULL(input[n_]); - auto data_type = input[n_]->GetTensorDesc().GetDataType(); - Status ret; - switch (data_type) { - case DT_INT8: - ret = GenData(input, v_output); - break; - case DT_UINT8: - ret = GenData(input, v_output); - break; - case DT_INT16: - ret = GenData(input, v_output); - break; - case DT_UINT16: - ret = GenData(input, v_output); - break; - case DT_INT32: - ret = GenData(input, v_output); - break; - case DT_INT64: - ret = GenData(input, v_output); - break; - case DT_BOOL: - ret = GenData(input, v_output); - break; - case DT_FLOAT16: - ret = GenData(input, v_output); - break; - case DT_FLOAT: - ret = GenData(input, v_output); - break; - case DT_DOUBLE: - ret = GenData(input, v_output); - break; - default: - ret = NOT_CHANGED; - GELOGI("Dynamic stitch op not support data type of %s.", TypeUtils::DataTypeToSerialString(data_type).c_str()); - break; + // OutputDesc size is not null, validated before + GeTensorPtr output_ptr = MakeShared(op_desc_ptr->GetOutputDesc(kFirstOutputDescIdx)); + if (output_ptr == nullptr) { + GELOGW("Fail to malloc output."); + return NOT_CHANGED; } + Status ret = GenData(input, output_ptr); if (ret != SUCCESS) { GELOGW("Dynamic stitch folding failed."); - return ret; + return NOT_CHANGED; } - GELOGI("Dynamic stitch end."); + v_output.push_back(output_ptr); + GELOGD("Dynamic stitch end."); return SUCCESS; } Status DynamicStitchKernel::ValidateParams(const OpDescPtr &op_desc_ptr, const std::vector &input) { if (op_desc_ptr == nullptr) { - GELOGE(PARAM_INVALID, "input opdesc is nullptr."); + GELOGE(PARAM_INVALID, "Input op_desc is nullptr."); + return PARAM_INVALID; + } + if (op_desc_ptr->GetOutputsSize() == 0) { + GELOGE(PARAM_INVALID, "Current output_desc is empty."); return PARAM_INVALID; } // 
validate input - // input[0]~input[N-1] is indices, input[N]~input[2N-1] is datas + // input[0]~input[N-1] is indices, input[N]~input[2N-1] is data if (input.empty()) { - GELOGI("Input is empty.Ignore dynamic stitch kernel."); + GELOGI("Input is empty. Ignore dynamic stitch kernel."); return NOT_CHANGED; } + for (const auto &in : input) { + if (in == nullptr) { + GELOGE(PARAM_INVALID, "input is nullptr."); + return PARAM_INVALID; + } + } // validate attrs - if (!(AttrUtils::GetInt(op_desc_ptr, DYNAMIC_STITCH_ATTR_NAME_NUM, n_))) { - GELOGW("Attr %s is not exist.", DYNAMIC_STITCH_ATTR_NAME_NUM.c_str()); + if (!(AttrUtils::GetInt(op_desc_ptr, ATTR_NAME_N, n_))) { + GELOGW("Attr %s does not exist.", ATTR_NAME_N.c_str()); return NOT_CHANGED; } // validate attr N and input.size - if ((kDoubleAttrN * n_) != static_cast(input.size())) { - GELOGW("Input size is not not match with attr N. Ignore dynamic stitch kernel."); + if ((kDoubleAttrN * n_) > static_cast(input.size())) { + GELOGW("Input size %zu does not match attr %d. Ignore dynamic stitch kernel.", input.size(), n_); + return NOT_CHANGED; + } + // validate supported datatype + DataType data_type = input[n_]->GetTensorDesc().GetDataType(); + if (kSupportedTypeSet.find(data_type) == kSupportedTypeSet.end()) { + GELOGW("Input data_type %s is not supported. Please check IR definition. Ignore dynamic stitch kernel.", + TypeUtils::DataTypeToSerialString(data_type).c_str()); + return NOT_CHANGED; + } return SUCCESS; } -template -void DynamicStitchKernel::ComputeMergedShape(const vector &input, GeShape &merged_shape, - map &indice_data_mapping) { - // data[i].shape = indices[i].shape + constant - size_t indice_dim = input[0]->GetTensorDesc().GetShape().GetDimNum(); - // index n_ for input is less than size of input - GeShape input_n_shape = input[n_]->GetTensorDesc().GetShape(); - int64_t dim_offset = (input_n_shape.GetDimNum() == indice_dim) ? 0 : input_n_shape.GetDim(indice_dim); - - int64_t merged_first_dim = 0; - vector indice_dims; +void DynamicStitchKernel::ComputeMergedShape(const vector &input, GeShape &merged_shape) { + // Safety note: index [1~2*n_] for input is valid, and all input is not null, validated in ValidateParams + // merged.shape = [max(indices)] + step + // 1. Compute merged first dim, which is the max index. + int32_t merged_first_dim = 0; + int64_t indices_shape_size = 0; for (int i = 0; i < n_; i++) { - // all index for input is less than size of input - indice_dims = input[i]->GetTensorDesc().GetShape().GetDims(); - int32_t *input_indice = const_cast(reinterpret_cast(input[i]->GetData().data())); - T *input_data = const_cast(reinterpret_cast(input[i + n_]->GetData().data())); - // scaler indice has one element - if (indice_dims.empty()) { - // if indice repeated, need new data replace old data - indice_data_mapping[input_indice[0]] = input_data[0]; - merged_first_dim = (merged_first_dim > input_indice[0]) ? merged_first_dim : input_indice[0]; - continue; - } - // vector indice element mapping - for (const auto &dim : indice_dims) { - for (auto j = 0; j < dim; j++) { - // if indice repeated, need new data replace old data - indice_data_mapping[input_indice[j]] = input_data[j]; - merged_first_dim = (merged_first_dim > input_indice[j]) ? merged_first_dim : input_indice[j]; - } + indices_shape_size = input[i]->GetTensorDesc().GetShape().GetShapeSize(); + indices_shape_size = indices_shape_size == 0 ? 
1 : indices_shape_size; + const int32_t *input_indices = reinterpret_cast(input[i]->GetData().data()); + for (int64_t j = 0; j < indices_shape_size; j++) { + merged_first_dim = std::max(merged_first_dim, input_indices[j]); } } - ++merged_first_dim; + // 2. Compute step, which is follow : step = data[t].shape - indices[t].shape + size_t indices_dim_num = input[0]->GetTensorDesc().GetShape().GetDimNum(); + GeShape data_shape = input[n_]->GetTensorDesc().GetShape(); + int64_t step = (data_shape.GetDimNum() == indices_dim_num) ? 0 : data_shape.GetDim(indices_dim_num); - vector merged_dim_vec = {merged_first_dim}; - if (dim_offset != 0) { - merged_dim_vec.emplace_back(dim_offset); - GELOGI("merged_shape is [ %ld, %ld].", merged_first_dim, dim_offset); + vector merged_dim_vec = {merged_first_dim + 1}; + if (step > 0) { + merged_dim_vec.emplace_back(step); + GELOGD("merged_shape is [ %ld, %ld].", merged_first_dim, step); } merged_shape = GeShape(merged_dim_vec); - GELOGI("merged_shape is [ %ld ].", merged_first_dim); + GELOGD("merged_shape is [ %ld ].", merged_first_dim); } -template -Status DynamicStitchKernel::GenData(const vector &input, vector &v_output) { +Status DynamicStitchKernel::GenData(const vector &input, GeTensorPtr &output_ptr) { + // Safety note: index [1~2*n_] for input is valid, and all input is not null, validated in ValidateParams GeShape merged_shape; - map indice_data_mapping; - ComputeMergedShape(input, merged_shape, indice_data_mapping); + ComputeMergedShape(input, merged_shape); + auto data_type = input[n_]->GetTensorDesc().GetDataType(); - int64_t output_size = merged_shape.GetShapeSize(); - unique_ptr buf(new (std::nothrow) T[output_size]()); + // 1.calc output data size + auto output_size = merged_shape.GetShapeSize(); + int64_t data_size = GetSizeByDataType(data_type); + auto step = merged_shape.GetDim(kMergedShapeSecondDim); + if (!CheckInt64MulOverflow(output_size, data_size) || !CheckInt64MulOverflow(step, data_size)) { + GELOGW("Check int64 mul overflow failed. Output_size is %ld, data_size is %ld, step is %ld.", output_size, + data_size, step); + return NOT_CHANGED; + } + auto allowance = output_size * data_size; + auto data_unit = step > 0 ? 
step * data_size : data_size; + // 2.allocate memory for output + std::unique_ptr buf(new (std::nothrow) uint8_t[allowance]); if (buf == nullptr) { - GELOGE(MEMALLOC_FAILED, "new buf failed"); + GELOGE(MEMALLOC_FAILED, "new buffer failed"); return INTERNAL_ERROR; } - for (const auto &indice_data : indice_data_mapping) { - auto index = indice_data.first; - buf[index] = indice_data.second; + // 3.copy data from input_data along with the sequence of input_indices + Status stitch_ret = StitchDataFollowIndices(data_unit, input, allowance, buf); + if (stitch_ret != SUCCESS) { + GELOGW("Stitch data follow index failed."); + return NOT_CHANGED; } - GeTensorPtr output_ptr = MakeShared(); - if (output_ptr == nullptr) { - GELOGW("Fail to malloc output."); + output_ptr->MutableTensorDesc().SetDataType(data_type); + output_ptr->MutableTensorDesc().SetShape(merged_shape); + Status ret = output_ptr->SetData(buf.get(), allowance); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "set data failed"); return NOT_CHANGED; } - auto dtype = input[n_]->GetTensorDesc().GetDataType(); - output_ptr->MutableTensorDesc().SetDataType(dtype); - output_ptr->MutableTensorDesc().SetShape(merged_shape); + return SUCCESS; +} - uint32_t length = 1; - if (!TypeUtils::GetDataTypeLength(dtype, length)) { - GELOGW("Can't GetDataTypeLength of data_type: %s", TypeUtils::DataTypeToSerialString(dtype).c_str()); - return NOT_CHANGED; +Status DynamicStitchKernel::StitchDataFollowIndices(int64_t data_unit, const vector &input, + int64_t allowance, std::unique_ptr &buf) { + // Safety note: index [1~2*n_] for input is valid, and all input is not null, validated in ValidateParams + int64_t dst_offset = 0; + int64_t src_offset = 0; + std::set indices_set; + for (int i = 0; i < n_; i++) { + auto indices_shape_size = input[i]->GetTensorDesc().GetShape().GetShapeSize(); + // to normalize logic, assume scalar as vector with shape of [1]. + indices_shape_size = (indices_shape_size == 0) ? 1 : indices_shape_size; + // all index for input is less than size of input + const int32_t *input_indices = reinterpret_cast(input[i]->GetData().data()); + const uint8_t *input_data = input[i + n_]->GetData().data(); + for (int64_t j = 0; j < indices_shape_size; j++) { + // if an index repeats, the new data replaces the old, so give more allowance + if (indices_set.find(input_indices[j]) != indices_set.end()) { + if (ge::CheckInt64AddOverflow(input_indices[j], data_unit) != SUCCESS) { + GELOGW("Check int64 add overflow failed. Indices is %ld, data_unit is %ld.", input_indices[j], data_unit); + return NOT_CHANGED; + } + allowance += data_unit; + } + indices_set.insert(input_indices[j]); + if (!CheckInt64MulOverflow(input_indices[j], data_unit)) { + GELOGW("Check int64 mul overflow failed. Indices is %ld, data_unit is %ld.", input_indices[j], data_unit); + return NOT_CHANGED; + } + dst_offset = input_indices[j] * data_unit; + src_offset = j * data_unit; + auto protected_size = + allowance < static_cast(SECUREC_MEM_MAX_LEN) ? 
allowance : static_cast(SECUREC_MEM_MAX_LEN); + auto ret = memcpy_s(buf.get() + dst_offset, protected_size, input_data + src_offset, data_unit); + if (ret != EOK) { + GELOGW("Memory copy failed."); + return NOT_CHANGED; + } + allowance -= data_unit; + } } - GE_IF_BOOL_EXEC(output_ptr->SetData(reinterpret_cast(buf.get()), - static_cast(output_size * length)) != GRAPH_SUCCESS, - GELOGE(INTERNAL_ERROR, "set data failed"); - return NOT_CHANGED); - v_output.push_back(output_ptr); return SUCCESS; } diff --git a/src/ge/graph/passes/folding_kernel/dynamic_stitch_kernel.h b/src/ge/graph/passes/folding_kernel/dynamic_stitch_kernel.h index 28de381f..512c731b 100644 --- a/src/ge/graph/passes/folding_kernel/dynamic_stitch_kernel.h +++ b/src/ge/graph/passes/folding_kernel/dynamic_stitch_kernel.h @@ -30,12 +30,10 @@ class DynamicStitchKernel : public Kernel { private: Status ValidateParams(const OpDescPtr &attr, const std::vector &input); - template - void ComputeMergedShape(const vector &input, GeShape &merged_shape, - map &indice_data_mapping); - template - Status GenData(const vector &input, vector &v_output); - + void ComputeMergedShape(const vector &input, GeShape &merged_shape); + Status GenData(const vector &input, GeTensorPtr &output_ptr); + Status StitchDataFollowIndices(int64_t data_unit, const vector &input, int64_t allowance, + std::unique_ptr &buf); int n_; // data input number }; } // namespace ge diff --git a/src/ge/graph/passes/folding_kernel/empty_kernel.cc b/src/ge/graph/passes/folding_kernel/empty_kernel.cc index 1e9ced22..1b135b9c 100644 --- a/src/ge/graph/passes/folding_kernel/empty_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/empty_kernel.cc @@ -43,8 +43,8 @@ Status EmptyKernel::EmptyCheck(const OpDescPtr &op_desc_ptr, const std::vectorGetAllInputsDesc().size() != kEmptyInputsSize) || (input.size() != kEmptyInputsSize) || - (op_desc_ptr->GetAllOutputsDesc().size() != kEmptyOutputsSize)); + ((op_desc_ptr->GetAllInputsDesc().size() != kEmptyInputsSize) || (input.size() != kEmptyInputsSize) || + (op_desc_ptr->GetAllOutputsDesc().size() != kEmptyOutputsSize)); if (size_check) { GELOGE(PARAM_INVALID, "Input/Output size error. 
InDesc size:%zu, OutDesc size:%zu, in size:%zu ", op_desc_ptr->GetAllInputsDesc().size(), op_desc_ptr->GetAllOutputsDesc().size(), input.size()); @@ -109,7 +109,7 @@ Status EmptyKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetTensorDesc().GetDataType(); ret = PARAM_INVALID; switch (data_type) { -#define CASE(dtype, type) \ - case dtype: \ - ret = KernelUtils::GenData(fill_size, *reinterpret_cast(value->GetData().data()), \ - output_ptr); \ +#define CASE(dtype, type) \ + case dtype: \ + ret = KernelUtils::GenData(fill_size, *reinterpret_cast(value->GetData().data()), output_ptr); \ break; CASE(DT_FLOAT, float) CASE(DT_FLOAT16, fp16_t) diff --git a/src/ge/graph/passes/folding_kernel/floordiv_kernel.cc b/src/ge/graph/passes/folding_kernel/floordiv_kernel.cc index d411e034..81595822 100644 --- a/src/ge/graph/passes/folding_kernel/floordiv_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/floordiv_kernel.cc @@ -41,7 +41,7 @@ Status FloorDivKernel::FloorDivCheck(const OpDescPtr &op_desc_ptr, const std::vector &input) const { // check input size if (op_desc_ptr == nullptr) { - GELOGE(PARAM_INVALID, "input opdesc is nullptr."); + GELOGW("Input opdesc is nullptr."); return PARAM_INVALID; } if (input.size() != kFloorDivInputSize) { @@ -256,7 +256,9 @@ Status FloorDivKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector(); + // Index 0 can always gets a GeTensorDesc object from any OpDescPtr. + auto output_tensor_desc = op_desc_ptr->GetOutputDesc(0); + GeTensorPtr output_ptr = MakeShared(output_tensor_desc); if (output_ptr == nullptr) { GELOGE(MEMALLOC_FAILED, "make_shared ge::GeTensor failed, node name %s.", op_desc_ptr->GetName().c_str()); return NOT_CHANGED; diff --git a/src/ge/graph/passes/folding_kernel/floordiv_kernel.h b/src/ge/graph/passes/folding_kernel/floordiv_kernel.h index a692ff67..c8505731 100644 --- a/src/ge/graph/passes/folding_kernel/floordiv_kernel.h +++ b/src/ge/graph/passes/folding_kernel/floordiv_kernel.h @@ -47,4 +47,4 @@ class FloorDivKernel : public Kernel { }; } // namespace ge -#endif // GE_GRAPH_PASSES_FOLDING_KERNEL_FLOORDIV_KERNEL_H_ \ No newline at end of file +#endif // GE_GRAPH_PASSES_FOLDING_KERNEL_FLOORDIV_KERNEL_H_ diff --git a/src/ge/graph/passes/folding_kernel/floormod_kernel.cc b/src/ge/graph/passes/folding_kernel/floormod_kernel.cc index b5b661bb..d7fb3b1c 100644 --- a/src/ge/graph/passes/folding_kernel/floormod_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/floormod_kernel.cc @@ -26,6 +26,7 @@ #include "graph/common/bcast.h" #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" + namespace ge { namespace { const size_t kFloorModInputX = 0; @@ -62,14 +63,14 @@ Status CheckYIsZero(T const &y, DataType &type) { } // mod(x,y) equals to x - y * floor(x/y) -#define DEFINE_FUNC_BY_TYPE(TYPE) \ - std::function func_##TYPE = []( \ - TYPE const &a, TYPE const &b, DataType &type, Status &ret) -> TYPE { \ - ret = CheckYIsZero(b, type); \ - if (ret != SUCCESS) { \ - return static_cast(0); \ - } \ - return (a - b * FloorDiv(a, b)); \ +#define DEFINE_FUNC_BY_TYPE(TYPE) \ + std::function func_##TYPE = \ + [](TYPE const &a, TYPE const &b, DataType &type, Status &ret) -> TYPE { \ + ret = CheckYIsZero(b, type); \ + if (ret != SUCCESS) { \ + return static_cast(0); \ + } \ + return (a - b * FloorDiv(a, b)); \ }; #define SET_BCAST_COMPUTE_CASE(DTYPE, TYPE) \ diff --git a/src/ge/graph/passes/folding_kernel/gather_v2_kernel.cc b/src/ge/graph/passes/folding_kernel/gather_v2_kernel.cc index 732e0b53..92c9e035 100644 --- 
a/src/ge/graph/passes/folding_kernel/gather_v2_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/gather_v2_kernel.cc @@ -29,6 +29,8 @@ #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" +using ge::fp16_t; + namespace ge { namespace { const size_t kGatherV2InputIndexZero = 0; diff --git a/src/ge/graph/passes/folding_kernel/greater_kernel.cc b/src/ge/graph/passes/folding_kernel/greater_kernel.cc index 816d3d05..4b4caa3a 100644 --- a/src/ge/graph/passes/folding_kernel/greater_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/greater_kernel.cc @@ -31,6 +31,7 @@ using domi::Status; using domi::SUCCESS; +using ge::fp16_t; namespace ge { namespace { diff --git a/src/ge/graph/passes/folding_kernel/kernel_utils.cc b/src/ge/graph/passes/folding_kernel/kernel_utils.cc index c9568d37..9448b232 100644 --- a/src/ge/graph/passes/folding_kernel/kernel_utils.cc +++ b/src/ge/graph/passes/folding_kernel/kernel_utils.cc @@ -55,7 +55,7 @@ Status KernelUtils::CheckDimensionNodeInfo(const NodePtr &node_ptr) { GELOGE(PARAM_INVALID, "dim node must be const op"); return PARAM_INVALID; } - ConstGeTensorPtr input_dim = const_ge_tensor.at(0); + const ConstGeTensorPtr &input_dim = const_ge_tensor.at(0); if (input_dim->GetData().size() == 0) { GELOGE(PARAM_INVALID, "dim data size is 0"); return PARAM_INVALID; @@ -74,7 +74,9 @@ bool KernelUtils::CheckFormatSupported(const NodePtr &node_ptr) { GELOGE(FAILED, "op_desc is null"); return false; } - Format fmt = op_desc->GetInputDesc(kDimensionShapeIndex).GetFormat(); + const auto &input_desc = op_desc->MutableInputDesc(kDimensionShapeIndex); + GE_CHECK_NOTNULL_EXEC(input_desc, return false); + Format fmt = input_desc->GetFormat(); if (fmt == FORMAT_NC1HWC0 || fmt == FORMAT_FRACTAL_Z) { GELOGW("invalid format, fmt: %s", TypeUtils::FormatToSerialString(fmt).c_str()); return false; @@ -83,18 +85,18 @@ bool KernelUtils::CheckFormatSupported(const NodePtr &node_ptr) { return true; } -bool KernelUtils::CheckSizeForTransOp(const ge::ConstGeTensorPtr &const_weight_ptr, - const ge::OpDescPtr &op_desc_ptr) { +bool KernelUtils::CheckSizeForTransOp(const ge::ConstGeTensorPtr &const_weight_ptr, const ge::OpDescPtr &op_desc_ptr) { if (const_weight_ptr == nullptr || op_desc_ptr == nullptr) { GELOGE(FAILED, "parameter invalid"); return false; } auto data_size = const_weight_ptr->GetData().GetSize(); - - DataType data_type = op_desc_ptr->GetInputDesc(0).GetDataType(); - GeShape data_shape = op_desc_ptr->GetInputDesc(0).GetShape(); - Format data_format = op_desc_ptr->GetInputDesc(0).GetFormat(); - auto shape_size = op_desc_ptr->GetInputDesc(0).GetShape().GetShapeSize(); + const auto &input_desc = op_desc_ptr->MutableInputDesc(0); + GE_CHECK_NOTNULL_EXEC(input_desc, return false); + DataType data_type = input_desc->GetDataType(); + GeShape data_shape = input_desc->GetShape(); + Format data_format = input_desc->GetFormat(); + auto shape_size = input_desc->GetShape().GetShapeSize(); int64_t cal_size = 0; auto ret = TensorUtils::CalcTensorMemSize(data_shape, data_format, data_type, cal_size); diff --git a/src/ge/graph/passes/folding_kernel/maximum_kernel.cc b/src/ge/graph/passes/folding_kernel/maximum_kernel.cc index 7f376019..9dd84f0a 100644 --- a/src/ge/graph/passes/folding_kernel/maximum_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/maximum_kernel.cc @@ -29,6 +29,8 @@ #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" +using ge::fp16_t; + namespace ge { namespace { const size_t kMaximumInputNum = 2; @@ -166,11 +168,11 @@ Status 
MaximumKernel::MaximumCheck(const std::vector &input) { } ConstGeTensorPtr input_x1 = input.at(kMaximumFirstInput); ConstGeTensorPtr input_x2 = input.at(kMaximumSecondInput); - GE_CHECK_NOTNULL(input_x2); + GE_CHECK_NOTNULL(input_x1); GE_CHECK_NOTNULL(input_x2); // check whether there is data in Tensor - if (input_x1->GetData().size() == 0 || input_x2->GetData().size() == 0) { + if ((input_x1->GetData().size() == 0) || (input_x2->GetData().size() == 0)) { GELOGI("Check data size fail. x1: %zu, x2: %zu", input_x1->GetData().size(), input_x2->GetData().size()); return NOT_CHANGED; } diff --git a/src/ge/graph/passes/folding_kernel/mul_kernel.cc b/src/ge/graph/passes/folding_kernel/mul_kernel.cc index 0c33ae53..4ca740d1 100644 --- a/src/ge/graph/passes/folding_kernel/mul_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/mul_kernel.cc @@ -31,42 +31,83 @@ namespace ge { namespace { -const std::set mul_supported_type = {DT_INT32, DT_UINT32}; - +const std::set kMulSupportedType = {DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, + DT_UINT32, DT_UINT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}; template -Status IsOverflow(T const &a, T const &b, DataType &type) { +Status OverflowCheck(T const &x, T const &y, DataType &type) { switch (type) { + case DT_INT8: + FMK_INT8_MULCHECK(x, y) + break; + case DT_INT16: + FMK_INT16_MULCHECK(x, y) + break; case DT_INT32: - return CheckInt32MulOverflow(a, b); + FMK_INT32_MULCHECK(x, y) + break; + case DT_INT64: + FMK_INT64_MULCHECK(x, y) + break; + case DT_UINT8: + FMK_UINT8_MULCHECK(x, y) + break; + case DT_UINT16: + FMK_UINT16_MULCHECK(x, y) + break; case DT_UINT32: - return CheckUint32MulOverflow(a, b); + FMK_UINT32_MULCHECK(x, y) + break; + case DT_UINT64: + FMK_UINT64_MULCHECK(x, y) + break; + case DT_FLOAT16: + FMK_FP16_MULCHECK(x, y) + break; + case DT_FLOAT: + FMK_FLOAT_MULCHECK(x, y) + break; + case DT_DOUBLE: + FMK_DOUBLE_MULCHECK(x, y) + break; default: - return FAILED; + break; } + + return SUCCESS; } #define DEFINE_FUNC_WITH_STATUS_BY_TYPE(TYPE) \ std::function func_##TYPE = \ [](TYPE const &a, TYPE const &b, DataType &type, Status &ret) -> TYPE { \ - ret = IsOverflow(a, b, type); \ + ret = OverflowCheck(a, b, type); \ if (ret != SUCCESS) { \ + GELOGE(PARAM_INVALID, "Result of mul is overflow."); \ return static_cast(0); \ } \ - return a * b; \ + return static_cast(a) * static_cast(b); \ }; -#define SET_BCAST_COMPUTE_CASE(DTYPE, TYPE) \ - case DTYPE: \ - ret = bcast.BCastComputeCheck(input, y_data_##TYPE, func_##TYPE); \ +#define SET_BCAST_COMPUTE_CASE(DTYPE, TYPE) \ + case DTYPE: \ + ret = bcast.BCastComputeCheck(input, y_data_##TYPE##_, func_##TYPE); \ break; -#define SET_OUTPUT(DTYPE, TYPE) \ - case DTYPE: \ - (void)output_ptr->SetData(reinterpret_cast(y_data_##TYPE.data()), y_data_##TYPE.size() * length); \ +#define SET_OUTPUT(DTYPE, TYPE) \ + case DTYPE: \ + (void)output_ptr->SetData(reinterpret_cast(y_data_##TYPE##_.data()), y_data_##TYPE##_.size() * length); \ break; // [no need to check result] +DEFINE_FUNC_WITH_STATUS_BY_TYPE(int8_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(int16_t) DEFINE_FUNC_WITH_STATUS_BY_TYPE(int32_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(int64_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(uint8_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(uint16_t) DEFINE_FUNC_WITH_STATUS_BY_TYPE(uint32_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(uint64_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(fp16_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(float) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(double) } // namespace Status MulKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector &input, 
@@ -81,13 +122,20 @@ Status MulKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector y_data_int32_t; - std::vector y_data_uint32_t; DataType data_type = input[0]->GetTensorDesc().GetDataType(); BCast bcast; switch (data_type) { + SET_BCAST_COMPUTE_CASE(DT_INT8, int8_t) + SET_BCAST_COMPUTE_CASE(DT_INT16, int16_t) SET_BCAST_COMPUTE_CASE(DT_INT32, int32_t) + SET_BCAST_COMPUTE_CASE(DT_INT64, int64_t) + SET_BCAST_COMPUTE_CASE(DT_UINT8, uint8_t) + SET_BCAST_COMPUTE_CASE(DT_UINT16, uint16_t) SET_BCAST_COMPUTE_CASE(DT_UINT32, uint32_t) + SET_BCAST_COMPUTE_CASE(DT_UINT64, uint64_t) + SET_BCAST_COMPUTE_CASE(DT_FLOAT16, fp16_t) + SET_BCAST_COMPUTE_CASE(DT_FLOAT, float) + SET_BCAST_COMPUTE_CASE(DT_DOUBLE, double) default: ret = NOT_CHANGED; break; @@ -114,8 +162,17 @@ Status MulKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorMutableTensorDesc().SetShape(GeShape(bcast.GetOutputShape())); // only return GRAPH_SUCCESS here switch (data_type) { + SET_OUTPUT(DT_INT8, int8_t) + SET_OUTPUT(DT_INT16, int16_t) SET_OUTPUT(DT_INT32, int32_t) + SET_OUTPUT(DT_INT64, int64_t) + SET_OUTPUT(DT_UINT8, uint8_t) + SET_OUTPUT(DT_UINT16, uint16_t) SET_OUTPUT(DT_UINT32, uint32_t) + SET_OUTPUT(DT_UINT64, uint64_t) + SET_OUTPUT(DT_FLOAT16, fp16_t) + SET_OUTPUT(DT_FLOAT, float) + SET_OUTPUT(DT_DOUBLE, double) default: break; } @@ -151,7 +208,7 @@ Status MulKernel::MulCheck(const std::vector &input) { } // check if input data type is supported - if (mul_supported_type.find(type) == mul_supported_type.end()) { + if (kMulSupportedType.find(type) == kMulSupportedType.end()) { GELOGI("Mul does not support this Data type: %s", TypeUtils::DataTypeToSerialString(type).c_str()); return NOT_CHANGED; } diff --git a/src/ge/graph/passes/folding_kernel/mul_kernel.h b/src/ge/graph/passes/folding_kernel/mul_kernel.h index 3116aee8..e7c74c41 100644 --- a/src/ge/graph/passes/folding_kernel/mul_kernel.h +++ b/src/ge/graph/passes/folding_kernel/mul_kernel.h @@ -21,6 +21,7 @@ #include "graph/ge_tensor.h" #include "inc/kernel.h" +#include "common/fp16_t.h" namespace ge { class MulKernel : public Kernel { @@ -30,6 +31,17 @@ class MulKernel : public Kernel { private: Status MulCheck(const std::vector &input); + std::vector y_data_int8_t_; + std::vector y_data_int16_t_; + std::vector y_data_int32_t_; + std::vector y_data_int64_t_; + std::vector y_data_uint8_t_; + std::vector y_data_uint16_t_; + std::vector y_data_uint32_t_; + std::vector y_data_uint64_t_; + std::vector y_data_fp16_t_; + std::vector y_data_float_; + std::vector y_data_double_; }; } // namespace ge diff --git a/src/ge/graph/passes/folding_kernel/permute_kernel.cc b/src/ge/graph/passes/folding_kernel/permute_kernel.cc index a6968300..551ef59e 100644 --- a/src/ge/graph/passes/folding_kernel/permute_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/permute_kernel.cc @@ -33,7 +33,6 @@ #include "graph/passes/folding_kernel/kernel_utils.h" #include "framework/common/ge_inner_error_codes.h" - namespace ge { namespace { const char *const kAttrOrder = "order"; @@ -52,8 +51,8 @@ Status PermuteKernel::ValidateInput(const OpDescPtr &op_desc_ptr, const std::vec return PARAM_INVALID; } const uint8_t *src_data = const_weight_ptr->GetData().data(); - if (op_desc_ptr == nullptr || src_data == nullptr) { - GELOGE(PARAM_INVALID, "Input opDescPtr is nullptr."); + if ((op_desc_ptr == nullptr) || (src_data == nullptr)) { + GELOGW("Input opDescPtr is nullptr."); return PARAM_INVALID; } if (op_desc_ptr->GetInputsSize() >= kTbePermuteInputSize) { @@ -75,21 +74,21 @@ Status 
PermuteKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetOutputDesc(0); GeTensorDesc op_desc_in = op_desc_ptr->GetInputDesc(0); auto src_format = op_desc_in.GetFormat(); - auto src_shape = op_desc_in.GetShape().GetDims(); + auto src_shape = op_desc_in.GetShape().GetDims(); auto src_data_type = op_desc_in.GetDataType(); auto data_shape = op_desc.GetShape().GetDims(); auto data_format = op_desc.GetFormat(); auto data_type = op_desc.GetDataType(); GELOGD( - "current node %s, format %s, input shape %s, data type %s, weight format %s, shape %s, data type %s. " - "output format %s, shape %s, data type %s", - op_desc_ptr->GetName().c_str(), TypeUtils::FormatToSerialString(src_format).c_str(), - formats::ShapeToString(src_shape).c_str(), TypeUtils::DataTypeToSerialString(src_data_type).c_str(), - TypeUtils::FormatToSerialString(const_weight_ptr->GetTensorDesc().GetFormat()).c_str(), - formats::ShapeToString(const_weight_ptr->GetTensorDesc().GetShape()).c_str(), - TypeUtils::DataTypeToSerialString(const_weight_ptr->GetTensorDesc().GetDataType()).c_str(), - TypeUtils::FormatToSerialString(data_format).c_str(), formats::ShapeToString(data_shape).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str()); + "current node %s, format %s, input shape %s, data type %s, weight format %s, shape %s, data type %s. " + "output format %s, shape %s, data type %s", + op_desc_ptr->GetName().c_str(), TypeUtils::FormatToSerialString(src_format).c_str(), + formats::ShapeToString(src_shape).c_str(), TypeUtils::DataTypeToSerialString(src_data_type).c_str(), + TypeUtils::FormatToSerialString(const_weight_ptr->GetTensorDesc().GetFormat()).c_str(), + formats::ShapeToString(const_weight_ptr->GetTensorDesc().GetShape()).c_str(), + TypeUtils::DataTypeToSerialString(const_weight_ptr->GetTensorDesc().GetDataType()).c_str(), + TypeUtils::FormatToSerialString(data_format).c_str(), formats::ShapeToString(data_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); vector perm_list; if (!AttrUtils::GetListInt(op_desc_ptr, kAttrOrder, perm_list) && @@ -103,7 +102,7 @@ Status PermuteKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector range_supported_type = {DT_INT32, DT_FLOAT}; +const std::set kRangeSupportedType = {DT_INT32, DT_FLOAT}; } // namespace Status RangeKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector &input, @@ -99,23 +99,23 @@ Status RangeKernel::RangeCheck(const std::vector &input) { // check whether the data types are the same DataType type = start->GetTensorDesc().GetDataType(); - if (type != limit->GetTensorDesc().GetDataType() || type != delta->GetTensorDesc().GetDataType()) { + if ((type != limit->GetTensorDesc().GetDataType()) || (type != delta->GetTensorDesc().GetDataType())) { GELOGI("Data type of inputs for Range not matched."); return NOT_CHANGED; } // check whether are all scalars size_t range_dim = static_cast(kRangeDimNum); - bool all_scalar = start->GetTensorDesc().MutableShape().GetDimNum() == range_dim && - limit->GetTensorDesc().MutableShape().GetDimNum() == range_dim && - delta->GetTensorDesc().MutableShape().GetDimNum() == range_dim; + bool all_scalar = (start->GetTensorDesc().MutableShape().GetDimNum() == range_dim) && + (limit->GetTensorDesc().MutableShape().GetDimNum() == range_dim) && + (delta->GetTensorDesc().MutableShape().GetDimNum() == range_dim); if (!all_scalar) { GELOGI("Inputs for Range are not all scalars."); return NOT_CHANGED; } // check if input data type is supported - if (range_supported_type.find(type) == 
range_supported_type.end()) { + if (kRangeSupportedType.find(type) == kRangeSupportedType.end()) { GELOGI("Range does not support this Data type: %s", TypeUtils::DataTypeToSerialString(type).c_str()); return NOT_CHANGED; } diff --git a/src/ge/graph/passes/folding_kernel/rank_kernel.cc b/src/ge/graph/passes/folding_kernel/rank_kernel.cc index 2dbd5e3d..ae14354b 100644 --- a/src/ge/graph/passes/folding_kernel/rank_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/rank_kernel.cc @@ -46,8 +46,9 @@ Status RankKernel::Compute(const NodePtr &node, std::vector &v_outp return NOT_CHANGED; } - GeTensorDesc input_shape = op_desc->GetInputDesc(kRankDataInputIndex); - auto ndims = input_shape.GetShape().GetDimNum(); + const auto &input_shape = op_desc->MutableInputDesc(kRankDataInputIndex); + GE_CHECK_NOTNULL(input_shape); + auto ndims = input_shape->GetShape().GetDimNum(); GeTensorDesc tensor_desc(op_desc->GetOutputDesc(0)); GeTensorPtr output_ptr; output_ptr = MakeShared(tensor_desc, reinterpret_cast(&ndims), sizeof(ndims)); diff --git a/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc b/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc index 8a6d22c8..76a67dac 100644 --- a/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc @@ -41,7 +41,7 @@ const std::set kReduceProdSupportedType = {DT_INT32}; Status ReduceProdKernel::ReduceProdCheck(const ge::OpDescPtr &op_desc_ptr, const std::vector &input) const { if (op_desc_ptr == nullptr) { - GELOGE(PARAM_INVALID, "input opdesc is nullptr."); + GELOGW("Input opdesc is nullptr."); return PARAM_INVALID; } if (input.size() != kReduceProdInputSize) { @@ -130,14 +130,15 @@ Status ReduceProdKernel::DataCal(const std::vector &input, } int32_t tmp_x = 1; + int32_t tmp_y = 1; for (int64_t i = 0; i < head_dim_; ++i) { for (int64_t j = 0; j < end_dim_; ++j) { // all index for input_data is less than size of input_data tmp_x = input_data[static_cast(i * end_dim_ * axis_dim_ + j)]; for (int64_t k = 1; k < axis_dim_; ++k) { - int32_t tmp_y = input_data[static_cast(i * end_dim_ * axis_dim_ + j + k * end_dim_)]; - if (CheckInt32MulOverflow(tmp_x, tmp_y) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Product is overflow. multiplier 1: %d. multiplier 2: %d.", tmp_x, tmp_y); + tmp_y = input_data[static_cast(i * end_dim_ * axis_dim_ + j + k * end_dim_)]; + if (ge::CheckInt32MulOverflow(tmp_x, tmp_y) != SUCCESS) { + GELOGW("Product is overflow. multiplier 1: %d. multiplier 2: %d.", tmp_x, tmp_y); return INTERNAL_ERROR; } tmp_x *= tmp_y; @@ -163,7 +164,7 @@ void ReduceProdKernel::ShapeCal(const ge::OpDescPtr &op_desc_ptr, const std::vec int32_t data_dim_size = static_cast(data_dims.size()); const uint8_t *axis_data = axis_tensor->GetData().GetData(); if (axis_data == nullptr) { - GE_LOGE("param axis_data must not be null."); + DOMI_LOGE("param axis_data must not be null."); return; } int32_t axis = *(const_cast(reinterpret_cast(axis_data))); @@ -220,8 +221,8 @@ Status ReduceProdKernel::ComputeNoAxis(const ge::OpDescPtr &op_desc_ptr, const s int32_t tmp_y = 1; for (size_t k = 1; k < data_num; ++k) { tmp_y = input_data[k]; - if (CheckInt32MulOverflow(tmp_x, tmp_y) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Product is overflow. multiplier 1: %d. multiplier 2: %d.", tmp_x, tmp_y); + if (ge::CheckInt32MulOverflow(tmp_x, tmp_y) != SUCCESS) { + GELOGW("Product is overflow. multiplier 1: %d.
multiplier 2: %d.", tmp_x, tmp_y); return INTERNAL_ERROR; } tmp_x *= tmp_y; @@ -245,7 +246,9 @@ Status ReduceProdKernel::Compute(const ge::OpDescPtr op_desc_ptr, const std::vec return NOT_CHANGED; } - GeTensorPtr output_ptr = MakeShared(); + // Index 0 can always gets a GeTensorDesc object from any OpDescPtr. + auto output_tensor_desc = op_desc_ptr->GetOutputDesc(0); + GeTensorPtr output_ptr = MakeShared(output_tensor_desc); if (output_ptr == nullptr) { GELOGE(MEMALLOC_FAILED, "make_shared ge::GeTensor failed, node name %s.", op_desc_ptr->GetName().c_str()); return NOT_CHANGED; diff --git a/src/ge/graph/passes/folding_kernel/reformat_kernel.cc b/src/ge/graph/passes/folding_kernel/reformat_kernel.cc index 0c84f089..8829d4c4 100644 --- a/src/ge/graph/passes/folding_kernel/reformat_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/reformat_kernel.cc @@ -33,11 +33,8 @@ const size_t kReformatFirstInput = 0; const size_t kReformatFirstOutput = 0; } // namespace -Status ReFormatKernel::ValidateInput(const OpDescPtr &op_desc_ptr, const std::vector &input) { - if (op_desc_ptr == nullptr) { - GELOGE(PARAM_INVALID, "Input opDescPtr is nullptr."); - return PARAM_INVALID; - } +Status ReFormatKernel::ValidateInput(const OpDescPtr &op_desc_ptr, const std::vector &input) const { + GE_CHECK_NOTNULL(op_desc_ptr); if (op_desc_ptr->GetInputsSize() != kReFormatInputSize) { GELOGW("trans_op has more than 1 input_size."); return PARAM_INVALID; @@ -63,12 +60,14 @@ Status ReFormatKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetOutputDesc(kReformatFirstOutput); - GeTensorDesc op_desc_in = op_desc_ptr->GetInputDesc(kReformatFirstInput); - auto src_shape = op_desc_in.GetShape().GetDims(); - auto src_dtype = op_desc_in.GetDataType(); - auto dst_shape = op_desc.GetShape().GetDims(); - auto dst_dtype = op_desc.GetDataType(); + const auto &op_desc = op_desc_ptr->MutableOutputDesc(kReformatFirstOutput); + const auto &op_desc_in = op_desc_ptr->MutableInputDesc(kReformatFirstInput); + GE_CHECK_NOTNULL(op_desc); + GE_CHECK_NOTNULL(op_desc_in); + const auto &src_shape = op_desc_in->GetShape().GetDims(); + const auto &src_dtype = op_desc_in->GetDataType(); + const auto &dst_shape = op_desc->GetShape().GetDims(); + const auto &dst_dtype = op_desc->GetDataType(); if (src_dtype != dst_dtype || src_shape != dst_shape) { GELOGW("Check params failed. src data type %s and shape %s should be equal to dst data type %s and shape %s", TypeUtils::DataTypeToSerialString(src_dtype).c_str(), formats::ShapeToString(src_shape).c_str(), diff --git a/src/ge/graph/passes/folding_kernel/reformat_kernel.h b/src/ge/graph/passes/folding_kernel/reformat_kernel.h index 6b9f14c8..e3d49acf 100644 --- a/src/ge/graph/passes/folding_kernel/reformat_kernel.h +++ b/src/ge/graph/passes/folding_kernel/reformat_kernel.h @@ -28,7 +28,7 @@ class ReFormatKernel : public Kernel { std::vector &v_output) override; private: - Status ValidateInput(const OpDescPtr &attr, const std::vector &input); + Status ValidateInput(const OpDescPtr &attr, const std::vector &input) const; }; } // namespace ge diff --git a/src/ge/graph/passes/folding_kernel/rsqrt_kernel.cc b/src/ge/graph/passes/folding_kernel/rsqrt_kernel.cc index ff3199a0..44da2bef 100644 --- a/src/ge/graph/passes/folding_kernel/rsqrt_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/rsqrt_kernel.cc @@ -73,7 +73,9 @@ Status RsqrtKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector(); + // Index 0 can always gets a GeTensorDesc object from any OpDescPtr. 
+ auto output_tensor_desc = op_desc_ptr->GetOutputDesc(0); + GeTensorPtr output_ptr = MakeShared(output_tensor_desc); if (output_ptr == nullptr) { GELOGE(MEMALLOC_FAILED, "MakeShared GeTensor failed, node name %s.", op_desc_ptr->GetName().c_str()); return NOT_CHANGED; diff --git a/src/ge/graph/passes/folding_kernel/shape_kernel.cc b/src/ge/graph/passes/folding_kernel/shape_kernel.cc index 38beee22..f7475b91 100644 --- a/src/ge/graph/passes/folding_kernel/shape_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/shape_kernel.cc @@ -42,11 +42,13 @@ Status ShapeKernel::Compute(const NodePtr &node, std::vector &v_out GELOGW("Size check fail, inputs size:%zu, outputs size:%zu", op_desc->GetInputsSize(), op_desc->GetOutputsSize()); return NOT_CHANGED; } - if (KernelUtils::IsUnknownShape(op_desc->GetInputDesc(0).GetShape())) { + const auto &input_desc = op_desc->MutableInputDesc(0); + GE_CHECK_NOTNULL(input_desc); + if (KernelUtils::IsUnknownShape(input_desc->GetShape())) { GELOGW("Input shape is unknown, ignore shape kernel."); return NOT_CHANGED; } - vector dims = op_desc->GetInputDesc(0).GetShape().GetDims(); + vector dims = input_desc->GetShape().GetDims(); Status ret = PassUtils::ConstructTensorDescWithData(op_desc->GetOutputDesc(0), dims, v_output); if (ret != SUCCESS) { GELOGE(ret, "Shape kernel construct tensor desc failed!"); diff --git a/src/ge/graph/passes/folding_kernel/shape_n_kernel.cc b/src/ge/graph/passes/folding_kernel/shape_n_kernel.cc index a4dbdedb..8ed546de 100644 --- a/src/ge/graph/passes/folding_kernel/shape_n_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/shape_n_kernel.cc @@ -40,12 +40,15 @@ Status ShapeNKernel::Compute(const NodePtr &node, std::vector &v_ou } for (size_t i = 0; i < op_desc->GetAllInputsDesc().size(); i++) { - if (KernelUtils::IsUnknownShape(op_desc->GetInputDesc(i).GetShape())) { + const auto &input_desc = op_desc->MutableInputDesc(static_cast(i)); + GE_CHECK_NOTNULL(input_desc); + if (KernelUtils::IsUnknownShape(input_desc->GetShape())) { GELOGW("Input %zu shape is unknown, ignore shape_n kernel.", i); return NOT_CHANGED; } - vector dims = op_desc->GetInputDesc(i).GetShape().GetDims(); - Status ret = PassUtils::ConstructTensorDescWithData(op_desc->GetOutputDesc(i), dims, v_output); + vector dims = input_desc->GetShape().GetDims(); + Status ret = + PassUtils::ConstructTensorDescWithData(op_desc->GetOutputDesc(static_cast(i)), dims, v_output); if (ret != SUCCESS) { GELOGE(PARAM_INVALID, "ShapeN kernel construct tensor desc failed, i:%zu", i); return ret; diff --git a/src/ge/graph/passes/folding_kernel/size_kernel.cc b/src/ge/graph/passes/folding_kernel/size_kernel.cc index c5d59fd3..3b121ba4 100644 --- a/src/ge/graph/passes/folding_kernel/size_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/size_kernel.cc @@ -17,7 +17,6 @@ #include "graph/passes/folding_kernel/size_kernel.h" #include - #include #include "common/debug/log.h" diff --git a/src/ge/graph/passes/folding_kernel/slice_d_kernel.cc b/src/ge/graph/passes/folding_kernel/slice_d_kernel.cc new file mode 100644 index 00000000..2660537a --- /dev/null +++ b/src/ge/graph/passes/folding_kernel/slice_d_kernel.cc @@ -0,0 +1,161 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "graph/passes/folding_kernel/slice_d_kernel.h" + +#include + +#include "common/fp16_t.h" +#include "common/op/ge_op_utils.h" +#include "common/types.h" +#include "framework/common/debug/ge_log.h" +#include "graph/passes/folding_kernel/kernel_utils.h" +#include "graph/utils/type_utils.h" +#include "inc/kernel_factory.h" + +using ge::fp16_t; + +namespace ge { +namespace { +const int64_t kDimMinusOne = -1; +const int64_t kDimZero = 0; +const int64_t KStrideLengthOne = 1; +const size_t kSliceDInputSize = 1; +const size_t kSliceDOutputSize = 1; +const char *const kSliceDAttrBegin = "offsets"; +const char *const kSliceDAttrSize = "size"; +} // namespace +Status SliceDKernel::SliceDCheck(const OpDescPtr &op_desc_ptr, const std::vector &input, + std::vector &begin_list, std::vector &size_list) { + // Check input size and output size + if ((input.size() != kSliceDInputSize) || (op_desc_ptr->GetInputsSize() != kSliceDInputSize) || + (op_desc_ptr->GetOutputsSize() != kSliceDOutputSize)) { + GELOGW("Unexpected SliceD node, node input size: %zu, node output size: %zu, node name: %s.", input.size(), + op_desc_ptr->GetOutputsSize(), op_desc_ptr->GetName().c_str()); + return PARAM_INVALID; + } + ConstGeTensorPtr x_tensor = input.at(0); // index 0 is guaranteed to be valid by input size check. + if (x_tensor == nullptr) { + GELOGW("SliceDKernel input tensor is nullptr."); + return PARAM_INVALID; + } + // Check data + if (x_tensor->GetData().size() == 0) { + GELOGW("SliceDKernel data size of input is 0, node name: %s.", op_desc_ptr->GetName().c_str()); + return PARAM_INVALID; + } + + // Get attr; + if (!AttrUtils::GetListInt(op_desc_ptr, kSliceDAttrBegin, begin_list)) { + GELOGW("SliceDKernel get attr begin failed, node name: %s.", op_desc_ptr->GetName().c_str()); + return PARAM_INVALID; + } + if (!AttrUtils::GetListInt(op_desc_ptr, kSliceDAttrSize, size_list)) { + GELOGW("SliceDKernel get attr size failed, node name: %s.", op_desc_ptr->GetName().c_str()); + return PARAM_INVALID; + } + // Check attr; + vector x_dims = x_tensor->GetTensorDesc().GetShape().GetDims(); + size_t x_dim_size = x_dims.size(); + if (x_dim_size != begin_list.size() || x_dim_size != size_list.size()) { + GELOGW("SliceDKernel rank of all shapes must be the same, input: %zu, begin: %zu, size: %zu, node name: %s.", + x_dim_size, begin_list.size(), size_list.size(), op_desc_ptr->GetName().c_str()); + return PARAM_INVALID; + } + for (size_t i = 0; i < x_dim_size; i++) { + int64_t x_dim_i = x_dims[i]; + int64_t begin_i = begin_list[i]; + int64_t size_i = size_list[i]; + if ((begin_i < kDimZero) || (begin_i > x_dim_i) || (size_i < kDimMinusOne) || (size_i > x_dim_i)) { + GELOGW("SliceDKernel dim[%zu] of attr is out of range, node name: %s.", i, op_desc_ptr->GetName().c_str()); + return PARAM_INVALID; + } + } + + return SUCCESS; +} + +Status SliceDKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector &input, + std::vector &v_output) { + GELOGD("SliceDKernel in"); + if (op_desc_ptr == nullptr) { + GELOGW("SliceDKernel input opdesc is nullptr."); + return NOT_CHANGED; + } + + vector begin_list; + 
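// Editor's note on the "offsets"/"size" attr semantics validated in SliceDCheck and normalized in Compute below: size_i == -1 selects all remaining elements of the dim (x_dim_i - begin_i); otherwise begin_i + size_i must not exceed x_dim_i. A one-line sketch of the normalization (editor's illustration, not GE code):
int64_t NormalizeSliceSize(int64_t dim, int64_t begin, int64_t size) {
  return (size == -1) ? dim - begin : size;  // -1 means "to the end of the dim"
}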
vector size_list; + if (SliceDCheck(op_desc_ptr, input, begin_list, size_list) != SUCCESS) { + GELOGW("SliceDKernel input is invalid, failed to fold node."); + return NOT_CHANGED; + } + + ConstGeTensorPtr x_tensor = input.at(0); // index 0 is guaranteed to be valid by input size check. + vector x_dims = x_tensor->GetTensorDesc().GetShape().GetDims(); + vector stride_list; + + bool has_zero_dim = false; + for (size_t i = 0; i < x_dims.size(); i++) { + int64_t x_dim_i = x_dims[i]; + int64_t begin_i = begin_list[i]; + int64_t size_i = size_list[i]; + if (size_i == kDimMinusOne) { + size_i = x_dim_i - begin_i; + size_list[i] = size_i; + } else if (begin_i + size_i > x_dim_i) { + GELOGW("SliceDKernel dim[%zu] of attr size is out of range, node name: %s.", i, op_desc_ptr->GetName().c_str()); + return NOT_CHANGED; + } + stride_list.push_back(KStrideLengthOne); + + // 0 appears in dims of input tensor or size tensor + if (size_i == kDimZero || x_dim_i == kDimZero) { + has_zero_dim = true; + } + } + + auto x_data_type = x_tensor->GetTensorDesc().GetDataType(); + auto output_tensor_desc = op_desc_ptr->GetOutputDesc(0); + GeTensorPtr output_ptr = MakeShared(output_tensor_desc); + if (output_ptr == nullptr) { + GELOGE(MEMALLOC_FAILED, "Failed to fold node %s, out of memory", op_desc_ptr->GetName().c_str()); + return NOT_CHANGED; + } + + output_ptr->MutableTensorDesc().SetShape(GeShape(size_list)); + output_ptr->MutableTensorDesc().SetDataType(x_data_type); + if (has_zero_dim) { + v_output.emplace_back(output_ptr); + GELOGI("SliceD folding kernel success, and output tensor has no data."); + return SUCCESS; + } + + void *data = reinterpret_cast(const_cast(x_tensor->GetData().data())); + int64_t x_data_size = x_tensor->GetTensorDesc().GetShape().GetShapeSize(); + Status ret = OpUtils::SetOutputSliceData(data, x_data_size, x_data_type, x_dims, begin_list, size_list, + output_ptr.get(), stride_list); + if (ret != SUCCESS) { + GELOGW("Set output data of SliceD failed."); + return NOT_CHANGED; + } + + v_output.emplace_back(output_ptr); + GELOGI("SliceD folding kernel success."); + return SUCCESS; +} + +REGISTER_KERNEL(SLICED, SliceDKernel); +} // namespace ge diff --git a/src/ge/graph/passes/folding_kernel/slice_d_kernel.h b/src/ge/graph/passes/folding_kernel/slice_d_kernel.h new file mode 100644 index 00000000..9fe35352 --- /dev/null +++ b/src/ge/graph/passes/folding_kernel/slice_d_kernel.h @@ -0,0 +1,35 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
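// The SliceD fold above hinges on one normalization rule: a "size" entry of
// -1 means "slice from begin to the end of that dimension", and any zero in
// the input dims or the normalized sizes yields an empty output tensor. A
// minimal standalone sketch of that rule (helper name and types are
// illustrative, not part of the GE API):
#include <cstdint>
#include <vector>

// Returns false when begin + size would read past the end of a dimension.
bool NormalizeSliceSizes(const std::vector<int64_t> &dims, const std::vector<int64_t> &begin,
                         std::vector<int64_t> &size, bool &has_zero_dim) {
  has_zero_dim = false;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (size[i] == -1) {
      size[i] = dims[i] - begin[i];  // -1: take everything after begin[i]
    } else if (begin[i] + size[i] > dims[i]) {
      return false;  // out of range; the kernel returns NOT_CHANGED here
    }
    has_zero_dim = has_zero_dim || (size[i] == 0) || (dims[i] == 0);
  }
  return true;
}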
+ */ + +#ifndef GE_GRAPH_PASSES_FOLDING_KERNEL_SLICED_KERNEL_H_ +#define GE_GRAPH_PASSES_FOLDING_KERNEL_SLICED_KERNEL_H_ + +#include +#include "inc/kernel.h" + +namespace ge { +class SliceDKernel : public Kernel { + public: + Status Compute(const OpDescPtr op_desc_ptr, const std::vector &input, + std::vector &v_output) override; + + private: + Status SliceDCheck(const OpDescPtr &op_desc_ptr, const std::vector &input, + std::vector &begin_list, std::vector &size_list); +}; +} // namespace ge + +#endif // GE_GRAPH_PASSES_FOLDING_KERNEL_SLICED_KERNEL_H_ diff --git a/src/ge/graph/passes/folding_kernel/slice_kernel.cc b/src/ge/graph/passes/folding_kernel/slice_kernel.cc index d14e740a..a1250367 100644 --- a/src/ge/graph/passes/folding_kernel/slice_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/slice_kernel.cc @@ -28,16 +28,16 @@ namespace ge { namespace { const size_t kSliceInputSize = 3; -const size_t kSliceInputIndex0 = 0; -const size_t kSliceInputIndex1 = 1; -const size_t kSliceInputIndex2 = 2; +const size_t kSliceInputIndexX = 0; +const size_t kSliceInputIndexBegin = 1; +const size_t kSliceInputIndexSize = 2; } // namespace Status SliceKernel::Compute(const OpDescPtr attr, const std::vector &input, vector &v_output) { GELOGI("SliceKernel in."); if (attr == nullptr) { - GELOGE(PARAM_INVALID, "input opdescptr is nullptr."); + GELOGW("Input opdescptr is nullptr."); return NOT_CHANGED; } // check input size @@ -46,9 +46,9 @@ Status SliceKernel::Compute(const OpDescPtr attr, const std::vector - #include #include #include @@ -184,13 +183,13 @@ Status SsdPriorboxKernel::SetVariance(const vector &variance, const int d Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint aspect_ratios_size, uint min_sizes_size, uint max_sizes_size, int layer_width, int layer_height, int &num_priors, int &dim_size) const { - if (CheckUint32MulOverflow(min_sizes_size, aspect_ratios_size) != SUCCESS) { + if (ge::CheckUint32MulOverflow(min_sizes_size, aspect_ratios_size) != SUCCESS) { return PARAM_INVALID; } uint tmp_value = aspect_ratios_size * min_sizes_size; - if (CheckUint32AddOverflow(tmp_value, max_sizes_size) != SUCCESS) { - GELOGE(PARAM_INVALID, "Failed to get list param."); + if (ge::CheckUint32AddOverflow(tmp_value, max_sizes_size) != SUCCESS) { + GELOGW("Failed to get list param."); return PARAM_INVALID; } tmp_value += max_sizes_size; @@ -201,18 +200,18 @@ Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint aspect_ratios_size, uint mi } num_priors = static_cast(tmp_value); - if (CheckIntMulOverflow(layer_width, layer_height) != SUCCESS) { - GELOGE(PARAM_INVALID, "Failed to get list param."); + if (ge::CheckIntMulOverflow(layer_width, layer_height) != SUCCESS) { + GELOGW("Failed to get list param."); return PARAM_INVALID; } - if (CheckIntMulOverflow(layer_width * layer_height, num_priors) != SUCCESS) { - GELOGE(PARAM_INVALID, "Failed to get list param."); + if (ge::CheckIntMulOverflow(layer_width * layer_height, num_priors) != SUCCESS) { + GELOGW("Failed to get list param."); return PARAM_INVALID; } - if (CheckIntMulOverflow(layer_width * layer_height * num_priors, kNumVariance) != SUCCESS) { - GELOGE(PARAM_INVALID, "Failed to get list param."); + if (ge::CheckIntMulOverflow(layer_width * layer_height * num_priors, kNumVariance) != SUCCESS) { + GELOGW("Failed to get list param."); return PARAM_INVALID; } dim_size = layer_width * layer_height * num_priors * kNumVariance; // 4 variance diff --git a/src/ge/graph/passes/folding_kernel/strided_slice_kernel.cc 
b/src/ge/graph/passes/folding_kernel/strided_slice_kernel.cc index c3728899..224cf7a8 100644 --- a/src/ge/graph/passes/folding_kernel/strided_slice_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/strided_slice_kernel.cc @@ -20,13 +20,13 @@ #include "common/fp16_t.h" #include "common/ge_inner_error_codes.h" +#include "common/math/math_util.h" #include "common/op/ge_op_utils.h" #include "framework/common/debug/ge_log.h" #include "graph/passes/folding_kernel/kernel_utils.h" #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" - namespace ge { namespace { const int32_t kNumOne = 1; @@ -35,6 +35,7 @@ const size_t kStridedSliceInputIndex0 = 0; const size_t kStridedSliceInputIndex1 = 1; const size_t kStridedSliceInputIndex2 = 2; const size_t kStridedSliceInputIndex3 = 3; +const int32_t kDefaultSrideSize = 1; } // namespace Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr, const std::vector &input, Attr &args) { @@ -72,16 +73,18 @@ Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr, const std::vec GELOGE(PARAM_INVALID, "get shrink_axis_mask attr failed."); return PARAM_INVALID; } - if (!(ellipsis_mask == 0 && new_axis_mask == 0)) { - GELOGE(NOT_CHANGED, "ellipsis_mask or new_axis_mask must be 0 with optimizer."); + if ((ellipsis_mask != 0) || (new_axis_mask != 0)) { + GELOGW("ellipsis_mask or new_axis_mask must be 0 with optimizer."); return NOT_CHANGED; } - DataType data_type = attr->GetInputDesc(kStridedSliceInputIndex0).GetDataType(); - if (!(data_type == DT_FLOAT || data_type == DT_INT32)) { + const auto &input_desc = attr->MutableInputDesc(kStridedSliceInputIndex0); + GE_CHECK_NOTNULL(input_desc); + DataType data_type = input_desc->GetDataType(); + if ((data_type != DT_FLOAT) && (data_type != DT_INT32)) { GELOGW( - "Data type of StridedSlice OP must be float or int32." - "Constant folding will not be carried out in this condition" - "which might affect the time performance but not the accuracy"); + "Data type of StridedSlice OP must be float or int32." 
+ "Constant folding will not be carried out in this condition" + "which might affect the time performance but not the accuracy"); } args.begin_mask = begin_mask; args.end_mask = end_mask; @@ -103,8 +106,8 @@ Status StridedSliceKernel::CheckAndGetAttr(const OpDescPtr &attr, const std::vec } Status StridedSliceKernel::CheckWeight(const ConstGeTensorPtr &weight0, const ConstGeTensorPtr &weight1, const ConstGeTensorPtr &weight2, const ConstGeTensorPtr &weight3) const { - if (weight0 == nullptr || weight1 == nullptr || weight2 == nullptr || weight3 == nullptr) { - GELOGE(PARAM_INVALID, "weight is nullptr."); + if ((weight0 == nullptr) || (weight1 == nullptr) || (weight2 == nullptr) || (weight3 == nullptr)) { + GELOGW("weight is nullptr."); return PARAM_INVALID; } if (!(weight1->GetTensorDesc().GetDataType() == DT_INT32 && weight2->GetTensorDesc().GetDataType() == DT_INT32 && @@ -118,25 +121,26 @@ Status StridedSliceKernel::CheckWeight(const ConstGeTensorPtr &weight0, const Co size_t weight1_size = weight1->GetData().size() / sizeof(int32_t); size_t weight2_size = weight2->GetData().size() / sizeof(int32_t); size_t weight3_size = weight3->GetData().size() / sizeof(int32_t); - if (weight0_size == 0 || weight1_size == 0 || weight2_size == 0 || weight3_size == 0) { + if ((weight0_size == 0) || (weight1_size == 0) || (weight2_size == 0) || (weight3_size == 0)) { GELOGW("Data size of inputs is 0."); return PARAM_INVALID; } // check dim size size_t weight0_dim_size = weight0->GetTensorDesc().GetShape().GetDimNum(); - if (!(weight0_dim_size == weight1_size && weight0_dim_size == weight2_size && weight0_dim_size == weight3_size)) { - GELOGE(PARAM_INVALID, "The sizes of begin, end and stride is not supported."); + if (!((weight0_dim_size >= weight1_size) && (weight1_size == weight2_size) && (weight1_size == weight3_size))) { + GELOGW("The sizes of begin, end and stride is not supported."); return NOT_CHANGED; } return SUCCESS; } -void StridedSliceKernel::MaskCal(const bool &begin_mask_flag, const bool &end_mask_flag, const bool &shrink_mask_flag, - int32_t &begin_i, int32_t &end_i, int32_t &dim_i) const { +Status StridedSliceKernel::MaskCal(const bool &begin_mask_flag, const bool &end_mask_flag, const bool &shrink_mask_flag, + int32_t &begin_i, int32_t &end_i, int32_t &dim_i) const { if (shrink_mask_flag) { begin_i = (begin_i < 0 ? (dim_i + begin_i) : begin_i); + FMK_INT32_ADDCHECK(begin_i, kNumOne); end_i = begin_i + kNumOne; } else { if (begin_mask_flag) { @@ -150,6 +154,18 @@ void StridedSliceKernel::MaskCal(const bool &begin_mask_flag, const bool &end_ma end_i = (end_i < 0 ? 
(dim_i + end_i) : end_i); } } + return SUCCESS; +} + +void StridedSliceKernel::GetOutputDims(uint32_t dims_size, const std::vector &output_dims, const Attr &args, + vector &v_dims) { + for (uint32_t k = 0; k < dims_size; k++) { + bool shrink_mask_i = (static_cast(args.shrink_axis_mask) & (1 << k)); + if (shrink_mask_i) { + continue; + } + v_dims.push_back(output_dims[k]); + } } Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector &input, @@ -183,6 +199,7 @@ Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector begin_vec; std::vector output_dims; std::vector stride_vec; + int64_t dim_final; for (size_t i = 0; i < dim_size; i++) { int32_t begin_i = begin[i]; int32_t end_i = end[i]; @@ -193,20 +210,32 @@ Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector(args.begin_mask) & (1 << i_temp)); bool end_mask_i = (static_cast(args.end_mask) & (1 << i_temp)); bool shrink_mask_i = (static_cast(args.shrink_axis_mask) & (1 << i_temp)); - MaskCal(begin_mask_i, end_mask_i, shrink_mask_i, begin_i, end_i, dim_i); - if (stride_i <= 0 || end_i <= begin_i) { - GELOGE(INTERNAL_ERROR, "Param for stride_slice is invalid."); + ret = MaskCal(begin_mask_i, end_mask_i, shrink_mask_i, begin_i, end_i, dim_i); + if (ret != SUCCESS) { + GELOGW("MaskCal failed, because of data overflow."); return NOT_CHANGED; } - - int64_t dim_final = (end_i - begin_i) / stride_i; + if (stride_i == 0) { + stride_i = kDefaultSrideSize; + } else if (stride_i < 0) { + stride_i = -stride_i; + begin_i = x_shape.GetDim(i) - begin_i - 1; + end_i = x_shape.GetDim(i) - end_i - 1; + } + if ((begin_i == 0) && (end_i == 0)) { + dim_final = x_shape.GetDim(i); + } else { + dim_final = abs(end_i - begin_i) / stride_i; + } output_dims.push_back(dim_final); input_dims.push_back(x_shape.GetDim(i)); begin_vec.push_back(begin_i); stride_vec.push_back(stride_i); } - GeTensorPtr output_ptr = MakeShared(); + // Index 0 can always gets a GeTensorDesc object from any OpDescPtr. 
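// The rewritten StridedSlice loop above resolves each dimension in three
// steps: apply the begin/end/shrink masks (MaskCal), normalize the stride
// (0 falls back to the default stride of 1, and a negative stride mirrors
// begin and end), then derive the output extent. The per-dimension
// arithmetic, sketched as a hypothetical standalone helper (masks already
// applied):
#include <cstdint>
#include <cstdlib>

int64_t ResolveSlicedDim(int64_t dim, int32_t &begin, int32_t &end, int32_t &stride) {
  if (stride == 0) {
    stride = 1;        // default stride
  } else if (stride < 0) {
    stride = -stride;  // negative stride: walk the dimension in reverse
    begin = static_cast<int32_t>(dim - begin - 1);
    end = static_cast<int32_t>(dim - end - 1);
  }
  if (begin == 0 && end == 0) {
    return dim;        // masks selected the whole dimension
  }
  return std::abs(static_cast<int64_t>(end) - begin) / stride;
}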
+ auto output_tensor_desc = attr->GetOutputDesc(0); + GeTensorPtr output_ptr = MakeShared(output_tensor_desc); if (output_ptr == nullptr) { GELOGE(MEMALLOC_FAILED, "MakeShared GeTensor failed, node name %s.", attr->GetName().c_str()); return NOT_CHANGED; @@ -226,13 +255,7 @@ Status StridedSliceKernel::Compute(const ge::OpDescPtr attr, const std::vector(output_dims.size()); vector v_dims; - for (uint32_t k = 0; k < final_dim_size; k++) { - bool shrink_mask_i = (static_cast(args.shrink_axis_mask) & (1 << k)); - if (shrink_mask_i) { - continue; - } - v_dims.push_back(output_dims[k]); - } + GetOutputDims(final_dim_size, output_dims, args, v_dims); t_d.SetShape(GeShape(v_dims)); v_output.push_back(output_ptr); GELOGI("StridedSliceKernel success."); diff --git a/src/ge/graph/passes/folding_kernel/strided_slice_kernel.h b/src/ge/graph/passes/folding_kernel/strided_slice_kernel.h index 047bc880..e569b2d0 100644 --- a/src/ge/graph/passes/folding_kernel/strided_slice_kernel.h +++ b/src/ge/graph/passes/folding_kernel/strided_slice_kernel.h @@ -40,9 +40,10 @@ class StridedSliceKernel : public Kernel { Status CheckAndGetAttr(const OpDescPtr &attr, const std::vector &input, Attr &args); Status CheckWeight(const ConstGeTensorPtr &weight0, const ConstGeTensorPtr &weight1, const ConstGeTensorPtr &weight2, const ConstGeTensorPtr &weight3) const; - void MaskCal(const bool &begin_mask_flag, const bool &end_mask_flag, const bool &shrink_mask_flag, int32_t &begin_i, - int32_t &end_i, int32_t &dim_i) const; + Status MaskCal(const bool &begin_mask_flag, const bool &end_mask_flag, const bool &shrink_mask_flag, int32_t &begin_i, + int32_t &end_i, int32_t &dim_i) const; + void GetOutputDims(uint32_t dims_size, const std::vector &output_dims, const Attr &args, + vector &v_dims); }; } // namespace ge - #endif // GE_GRAPH_PASSES_FOLDING_KERNEL_STRIDED_SLICE_KERNEL_H_ diff --git a/src/ge/graph/passes/folding_kernel/sub_kernel.cc b/src/ge/graph/passes/folding_kernel/sub_kernel.cc index 4fe71bb0..5934c6c1 100644 --- a/src/ge/graph/passes/folding_kernel/sub_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/sub_kernel.cc @@ -16,10 +16,12 @@ #include "graph/passes/folding_kernel/sub_kernel.h" +#include +#include #include #include "common/debug/log.h" -#include "common/fp16_t.h" +#include "common/math/math_util.h" #include "common/op/ge_op_utils.h" #include "graph/common/bcast.h" #include "graph/utils/type_utils.h" @@ -33,31 +35,81 @@ const size_t kSubFirstOutput = 0; const size_t kSubOutputSize = 1; const size_t kSubInputSize = 2; -#define DEFINE_FUNC_BY_TYPE(TYPE) \ - std::function func_##TYPE = [](TYPE const &a, TYPE const &b) -> TYPE { \ - return a - b; \ +template +Status OverflowCheck(T const &x, T const &y, DataType &data_type) { + switch (data_type) { + case DT_INT8: + FMK_INT8_SUBCHECK(x, y) + break; + case DT_INT16: + FMK_INT16_SUBCHECK(x, y) + break; + case DT_INT32: + FMK_INT32_SUBCHECK(x, y) + break; + case DT_INT64: + FMK_INT64_SUBCHECK(x, y) + break; + case DT_UINT8: + FMK_UINT8_SUBCHECK(x, y) + break; + case DT_UINT16: + FMK_UINT16_SUBCHECK(x, y) + break; + case DT_UINT32: + FMK_UINT32_SUBCHECK(x, y) + break; + case DT_UINT64: + FMK_UINT64_SUBCHECK(x, y) + break; + case DT_FLOAT16: + FMK_FP16_SUBCHECK(x, y) + break; + case DT_FLOAT: + FMK_FLOAT_SUBCHECK(x, y) + break; + case DT_DOUBLE: + FMK_DOUBLE_SUBCHECK(x, y) + break; + default: + break; + } + + return SUCCESS; +} + +#define DEFINE_FUNC_WITH_STATUS_BY_TYPE(TYPE) \ + std::function func_##TYPE = \ + [](TYPE const &x, TYPE const &y, DataType &type, Status 
&ret) -> TYPE { \ + ret = OverflowCheck(x, y, type); \ + if (ret != SUCCESS) { \ + GELOGE(PARAM_INVALID, "Result of sub is overflow."); \ + return static_cast(0); \ + } \ + return static_cast(x) - static_cast(y); \ }; -#define SET_BCAST_COMPUTE_CASE(DTYPE, TYPE) \ - case DTYPE: \ - ret = bcast.BCastCompute(input, y_data_##TYPE, func_##TYPE); \ +#define SET_BCAST_COMPUTE_CASE(DTYPE, TYPE) \ + case DTYPE: \ + ret = bcast.BCastComputeCheck(input, y_data_##TYPE##_, func_##TYPE); \ break; -#define SET_OUTPUT(DTYPE, TYPE) \ - case DTYPE: \ - (void)output_ptr->SetData(reinterpret_cast(y_data_##TYPE.data()), y_data_##TYPE.size() * length); \ +#define SET_OUTPUT(DTYPE, TYPE) \ + case DTYPE: \ + (void)output_ptr->SetData(reinterpret_cast(y_data_##TYPE##_.data()), y_data_##TYPE##_.size() * length); \ break; -DEFINE_FUNC_BY_TYPE(int8_t) -DEFINE_FUNC_BY_TYPE(int16_t) -DEFINE_FUNC_BY_TYPE(int32_t) -DEFINE_FUNC_BY_TYPE(int64_t) -DEFINE_FUNC_BY_TYPE(uint8_t) -DEFINE_FUNC_BY_TYPE(uint16_t) -DEFINE_FUNC_BY_TYPE(uint32_t) -DEFINE_FUNC_BY_TYPE(uint64_t) -DEFINE_FUNC_BY_TYPE(float) -DEFINE_FUNC_BY_TYPE(double) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(int8_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(int16_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(int32_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(int64_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(uint8_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(uint16_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(uint32_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(uint64_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(fp16_t) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(float) +DEFINE_FUNC_WITH_STATUS_BY_TYPE(double) } // namespace Status SubKernel::Compute(const ge::OpDescPtr op_desc_ptr, const std::vector &input, @@ -66,7 +118,7 @@ Status SubKernel::Compute(const ge::OpDescPtr op_desc_ptr, const std::vectorGetOutputsSize() != kSubOutputSize)) { GELOGW("The number of input for sub must be %zu.", kSubInputSize); - return PARAM_INVALID; + return NOT_CHANGED; } GE_CHECK_NOTNULL(input[kSubFirstInput]); @@ -74,19 +126,6 @@ Status SubKernel::Compute(const ge::OpDescPtr op_desc_ptr, const std::vector y_data_int8_t; - std::vector y_data_int16_t; - std::vector y_data_int32_t; - std::vector y_data_int64_t; - std::vector y_data_uint8_t; - std::vector y_data_uint16_t; - std::vector y_data_uint32_t; - std::vector y_data_uint64_t; - std::vector y_data_fp16_t; - std::vector y_data_float; - std::vector y_data_double; - Status ret; DataType data_type = input[kSubFirstInput]->GetTensorDesc().GetDataType(); BCast bcast; @@ -99,9 +138,11 @@ Status SubKernel::Compute(const ge::OpDescPtr op_desc_ptr, const std::vector #include "inc/kernel.h" +#include "common/fp16_t.h" namespace ge { class SubKernel : public Kernel { public: Status Compute(const ge::OpDescPtr attr, const std::vector &input, vector &v_output) override; + + private: + std::vector y_data_int8_t_; + std::vector y_data_int16_t_; + std::vector y_data_int32_t_; + std::vector y_data_int64_t_; + std::vector y_data_uint8_t_; + std::vector y_data_uint16_t_; + std::vector y_data_uint32_t_; + std::vector y_data_uint64_t_; + std::vector y_data_fp16_t_; + std::vector y_data_float_; + std::vector y_data_double_; }; } // namespace ge diff --git a/src/ge/graph/passes/folding_kernel/transdata_kernel.cc b/src/ge/graph/passes/folding_kernel/transdata_kernel.cc index 46ba8805..b1bfe92d 100644 --- a/src/ge/graph/passes/folding_kernel/transdata_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/transdata_kernel.cc @@ -33,7 +33,6 @@ #include "graph/utils/type_utils.h" #include "inc/kernel_factory.h" - namespace ge { namespace { const size_t 
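// DEFINE_FUNC_WITH_STATUS_BY_TYPE above wraps every typed subtraction lambda
// in an overflow check so constant folding bails out (NOT_CHANGED) instead of
// silently wrapping. The FMK_*_SUBCHECK macros are GE-internal; a rough
// free-standing equivalent of the integer case, using <limits> (a sketch,
// not the macros' actual implementation):
#include <limits>

template <typename T>
bool SubWouldOverflow(T x, T y) {
  // x - y underflows when y > 0 and x is too close to the minimum,
  // and overflows when y < 0 and x is too close to the maximum.
  if ((y > 0) && (x < std::numeric_limits<T>::min() + y)) {
    return true;
  }
  if ((y < 0) && (x > std::numeric_limits<T>::max() + y)) {
    return true;
  }
  return false;
}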
kTransdataInputSize = 1; @@ -71,24 +70,26 @@ Status TransdataKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetOutputDesc(0); - GeTensorDesc op_desc_in = op_desc_ptr->GetInputDesc(0); - auto src_format = op_desc_in.GetFormat(); - auto src_shape = op_desc_in.GetShape().GetDims(); - auto src_data_type = op_desc_in.GetDataType(); - auto data_shape = op_desc.GetShape().GetDims(); - auto data_format = op_desc.GetFormat(); - auto data_type = op_desc.GetDataType(); + const auto &op_desc = op_desc_ptr->MutableOutputDesc(0); + const auto &op_desc_in = op_desc_ptr->MutableInputDesc(0); + GE_CHECK_NOTNULL(op_desc); + GE_CHECK_NOTNULL(op_desc_in); + const auto &src_format = op_desc_in->GetFormat(); + const auto &src_shape = op_desc_in->GetShape().GetDims(); + const auto &src_data_type = op_desc_in->GetDataType(); + const auto &data_shape = op_desc->GetShape().GetDims(); + const auto &data_format = op_desc->GetFormat(); + const auto &data_type = op_desc->GetDataType(); GELOGD( - "current node %s, format %s, input shape %s, data type %s, weight format %s, shape %s, data type %s. " - "output format %s, shape %s, data type %s", - op_desc_ptr->GetName().c_str(), TypeUtils::FormatToSerialString(src_format).c_str(), - formats::ShapeToString(src_shape).c_str(), TypeUtils::DataTypeToSerialString(src_data_type).c_str(), - TypeUtils::FormatToSerialString(const_weight_ptr->GetTensorDesc().GetFormat()).c_str(), - formats::ShapeToString(const_weight_ptr->GetTensorDesc().GetShape()).c_str(), - TypeUtils::DataTypeToSerialString(const_weight_ptr->GetTensorDesc().GetDataType()).c_str(), - TypeUtils::FormatToSerialString(data_format).c_str(), formats::ShapeToString(data_shape).c_str(), - TypeUtils::DataTypeToSerialString(data_type).c_str()); + "current node %s, format %s, input shape %s, data type %s, weight format %s, shape %s, data type %s. " + "output format %s, shape %s, data type %s", + op_desc_ptr->GetName().c_str(), TypeUtils::FormatToSerialString(src_format).c_str(), + formats::ShapeToString(src_shape).c_str(), TypeUtils::DataTypeToSerialString(src_data_type).c_str(), + TypeUtils::FormatToSerialString(const_weight_ptr->GetTensorDesc().GetFormat()).c_str(), + formats::ShapeToString(const_weight_ptr->GetTensorDesc().GetShape()).c_str(), + TypeUtils::DataTypeToSerialString(const_weight_ptr->GetTensorDesc().GetDataType()).c_str(), + TypeUtils::FormatToSerialString(data_format).c_str(), formats::ShapeToString(data_shape).c_str(), + TypeUtils::DataTypeToSerialString(data_type).c_str()); const uint8_t *src_data = const_weight_ptr->GetData().data(); const formats::TransArgs trans_args{src_data, src_format, data_format, src_shape, data_shape, src_data_type}; diff --git a/src/ge/graph/passes/folding_kernel/unpack_kernel.cc b/src/ge/graph/passes/folding_kernel/unpack_kernel.cc new file mode 100644 index 00000000..92ad140a --- /dev/null +++ b/src/ge/graph/passes/folding_kernel/unpack_kernel.cc @@ -0,0 +1,91 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/passes/folding_kernel/unpack_kernel.h"
+#include "common/debug/ge_log.h"
+#include "common/op/ge_op_utils.h"
+#include "common/op/ge_op_utils.h"
+#include "common/types.h"
+#include "graph/debug/ge_attr_define.h"
+#include "inc/kernel_factory.h"
+
+namespace ge {
+namespace {
+const size_t kUnpackInputNum = 1;
+}  // namespace
+template <typename T>
+Status CalcUpack(const int32_t num, const DataType data_type, const T *value, std::vector<GeTensorPtr> &v_output) {
+  GE_CHECK_NOTNULL(value);
+  // num = 0 is not supported
+  if (num > 0) {
+    unique_ptr<T[]> buf(new (std::nothrow) T[num]());
+    GE_CHECK_NOTNULL(buf);
+    for (int32_t i = 0; i < num; ++i) {
+      GeTensorPtr output_ptr = ge::MakeShared<GeTensor>();
+      GE_CHECK_NOTNULL(output_ptr);
+
+      buf[i] = *value;
+      ++value;
+      GE_CHK_STATUS_RET(output_ptr->SetData(reinterpret_cast<uint8_t *>(&buf[i]), sizeof(T)),
+                        "unpack set data failed!");
+      output_ptr->MutableTensorDesc().SetDataType(data_type);
+      v_output.push_back(output_ptr);
+    }
+  } else {
+    GELOGW("num <= 0 is not supported.");
+    return NOT_CHANGED;
+  }
+  return SUCCESS;
+}
+
+Status UnpackKernel::Compute(const OpDescPtr attr, const std::vector<ConstGeTensorPtr> &input,
+                             std::vector<GeTensorPtr> &v_output) {
+  GE_CHECK_NOTNULL(attr);
+  // check input num
+  GE_RT_PARAM_INVALID_WITH_LOG_IF_FALSE(input.size() == kUnpackInputNum,
+                                        "The number of input for unpack must be %zu, real is %zu.", kUnpackInputNum,
+                                        input.size());
+
+  ConstGeTensorPtr dims = input[0];
+  GE_CHECK_NOTNULL(dims);
+  ge::DataType data_type;
+  GE_CHK_BOOL_RET_STATUS(AttrUtils::GetDataType(attr, ATTR_NAME_T, data_type), PARAM_INVALID, "get T attr failed.");
+  // data_type must be FLOAT or INT32
+  GE_CHK_BOOL_RET_STATUS((data_type == DT_FLOAT || data_type == DT_INT32), PARAM_INVALID, "T must be float or int32.");
+
+  // input dim size must = 1
+  GE_RT_PARAM_INVALID_WITH_LOG_IF_FALSE((dims->GetTensorDesc().GetShape().GetDimNum() == 1),
+                                        "input tensor must be 1 dim, real is %zu.",
+                                        dims->GetTensorDesc().GetShape().GetDimNum());
+
+  int64_t num = 0;
+  GE_CHK_BOOL_RET_STATUS(AttrUtils::GetInt(attr, UNPACK_ATTR_NAME_NUM, num), PARAM_INVALID, "get num attr failed.");
+  size_t data_count = dims->GetData().size() / sizeof(float);
+  // num must equal to input_data size
+  GE_RT_PARAM_INVALID_WITH_LOG_IF_FALSE(data_count == static_cast<size_t>(num),
+                                        "input tensor size not equal num, data_count:%zu, num:%ld.", data_count, num);
+  // calculate result
+  if (data_type == DT_FLOAT) {
+    GE_RETURN_IF_ERROR(CalcUpack(num, data_type, reinterpret_cast<const float *>(dims->GetData().data()), v_output));
+  } else {
+    GE_RETURN_IF_ERROR(CalcUpack(num, data_type, reinterpret_cast<const int32_t *>(dims->GetData().data()), v_output));
+  }
+
+  return SUCCESS;
+}
+
+REGISTER_KERNEL(UNPACK, UnpackKernel);
+}  // namespace ge
diff --git a/src/ge/graph/optimize/optimizer/pass.h b/src/ge/graph/passes/folding_kernel/unpack_kernel.h
similarity index 60%
rename from src/ge/graph/optimize/optimizer/pass.h
rename to src/ge/graph/passes/folding_kernel/unpack_kernel.h
index e545ae8b..e8b6f901 100644
--- a/src/ge/graph/optimize/optimizer/pass.h
+++ b/src/ge/graph/passes/folding_kernel/unpack_kernel.h
@@ -14,27 +14,17 @@
  * limitations under the License.
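// UnpackKernel::Compute above folds Unpack by splitting a 1-D constant of
// length num into num scalar tensors, one GeTensor per element. The core
// copy loop, reduced to plain C++ with the GeTensor replaced by a byte
// buffer (illustrative only):
#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
std::vector<std::vector<uint8_t>> UnpackFlat(const T *value, int32_t num) {
  std::vector<std::vector<uint8_t>> outputs;
  if (value == nullptr || num <= 0) {
    return outputs;  // mirrors the kernel's num <= 0 / nullptr rejection
  }
  outputs.reserve(static_cast<size_t>(num));
  for (int32_t i = 0; i < num; ++i) {
    std::vector<uint8_t> blob(sizeof(T));
    std::memcpy(blob.data(), &value[i], sizeof(T));  // one element per output tensor
    outputs.push_back(std::move(blob));
  }
  return outputs;
}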
*/ -#ifndef GE_GRAPH_OPTIMIZE_OPTIMIZER_PASS_H_ -#define GE_GRAPH_OPTIMIZE_OPTIMIZER_PASS_H_ - -#include -#include "common/ge_inner_error_codes.h" +#ifndef GE_GRAPH_PASSES_FOLDING_KERNEL_UNPACK_KERNEL_H_ +#define GE_GRAPH_PASSES_FOLDING_KERNEL_UNPACK_KERNEL_H_ +#include +#include "inc/kernel.h" namespace ge { -/// -/// @ingroup domi_omg -/// @brief pass -/// @author -/// -template -class Pass { +class UnpackKernel : public Kernel { public: - virtual ~Pass() {} - /// - /// run pass - /// @author - /// - virtual Status Run(std::shared_ptr) = 0; + virtual Status Compute(const ge::OpDescPtr attr, const std::vector &input, + std::vector &v_output) override; }; } // namespace ge -#endif // GE_GRAPH_OPTIMIZE_OPTIMIZER_PASS_H_ + +#endif // GE_GRAPH_PASSES_FOLDING_KERNEL_UNPACK_KERNEL_H_ diff --git a/src/ge/graph/passes/folding_pass.cc b/src/ge/graph/passes/folding_pass.cc index fef34eb3..41528ec3 100644 --- a/src/ge/graph/passes/folding_pass.cc +++ b/src/ge/graph/passes/folding_pass.cc @@ -28,6 +28,7 @@ #include "inc/kernel.h" #include "inc/kernel_factory.h" #include "graph/debug/ge_attr_define.h" +#include "ge_local_engine/engine/host_cpu_engine.h" namespace ge { namespace folding_pass { @@ -46,6 +47,10 @@ shared_ptr GetKernelByType(const NodePtr &node) { return factory.Create(type); } +bool IsNoNeedConstantFolding(const NodePtr &node) { + auto node_desc = node->GetOpDesc(); + return node_desc == nullptr || node_desc->HasAttr(ATTR_NO_NEED_CONSTANT_FOLDING); +} } // namespace folding_pass namespace { @@ -107,6 +112,11 @@ NodePtr AddIdentityNodeToGraph(const std::string &name, const GeTensorDesc &tens } } // namespace +Status FoldingPass::RunOpKernel(NodePtr &node, const vector &inputs, + std::vector &outputs) { + return HostCpuEngine::GetInstance().Run(node, inputs, outputs); +} + Status FoldingPass::Folding(NodePtr &node, vector &outputs) { GE_CHECK_NOTNULL(node); GELOGD("begin folding node:%s", node->GetName().c_str()); diff --git a/src/ge/graph/passes/folding_pass.h b/src/ge/graph/passes/folding_pass.h index 41215dfe..9c8d3a7e 100644 --- a/src/ge/graph/passes/folding_pass.h +++ b/src/ge/graph/passes/folding_pass.h @@ -27,13 +27,15 @@ namespace ge { namespace folding_pass { shared_ptr GetKernelByType(const NodePtr &node); -} +bool IsNoNeedConstantFolding(const NodePtr &node); +} // namespace folding_pass using IndexsToAnchors = std::map>; class FoldingPass : public BaseNodePass { protected: Status Folding(NodePtr &node, vector &outputs); + static Status RunOpKernel(NodePtr &node, const vector &inputs, vector &outputs); private: Status AddConstNode(NodePtr &node, IndexsToAnchors indexes_to_anchors, std::vector &v_weight); diff --git a/src/ge/graph/passes/get_original_format_pass.cc b/src/ge/graph/passes/get_original_format_pass.cc index d6f795fb..5b7e84c2 100644 --- a/src/ge/graph/passes/get_original_format_pass.cc +++ b/src/ge/graph/passes/get_original_format_pass.cc @@ -33,6 +33,8 @@ using domi::FAILED; using domi::PARAM_INVALID; using domi::SUCCESS; +using domi::GetContext; + namespace ge { Status GetOriginalFormatPass::Run(ge::ComputeGraphPtr graph) { GE_CHECK_NOTNULL(graph); diff --git a/src/ge/graph/passes/guarantee_const_pass.cc b/src/ge/graph/passes/guarantee_const_pass.cc index 761e6e16..f099c01d 100644 --- a/src/ge/graph/passes/guarantee_const_pass.cc +++ b/src/ge/graph/passes/guarantee_const_pass.cc @@ -45,9 +45,11 @@ Status GuaranteeConstPass::Run(NodePtr &node) { GELOGE(PARAM_INVALID, "input size error. 
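// The folding_pass hunks above add two hooks: IsNoNeedConstantFolding screens
// out nodes tagged with ATTR_NO_NEED_CONSTANT_FOLDING, and RunOpKernel
// forwards evaluation to the new HostCpuEngine singleton. How a caller might
// combine them, assuming only the signatures visible in these hunks (a
// sketch, not code from the patch):
Status TryFoldOnHost(NodePtr &node, const vector<ConstGeTensorPtr> &inputs, std::vector<GeTensorPtr> &outputs) {
  if (folding_pass::IsNoNeedConstantFolding(node)) {
    return NOT_CHANGED;  // node opted out of constant folding
  }
  // Evaluate the node on the host CPU; outputs become the folded constants.
  return HostCpuEngine::GetInstance().Run(node, inputs, outputs);
}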
Input size:%zu", node->GetOpDesc()->GetAllInputsDesc().size()); return PARAM_INVALID; } - auto inDesc = node->GetOpDesc()->GetInputDesc(0); + // [Cascade pointer] + const auto &in_desc = node->GetOpDesc()->MutableInputDesc(0); + GE_CHECK_NOTNULL(in_desc); // Input tensor cannot be a resource variable handle. - const DataType input_dtype = inDesc.GetDataType(); + const DataType &input_dtype = in_desc->GetDataType(); if (input_dtype == DT_RESOURCE) { GELOGE(FAILED, "Input tensor cannot be a resource variable handle in [%s].", node->GetName().c_str()); return FAILED; diff --git a/src/ge/graph/passes/hccl_memcpy_pass.cc b/src/ge/graph/passes/hccl_memcpy_pass.cc index 4588c1c2..ac037d62 100644 --- a/src/ge/graph/passes/hccl_memcpy_pass.cc +++ b/src/ge/graph/passes/hccl_memcpy_pass.cc @@ -26,7 +26,7 @@ #include "graph/utils/graph_utils.h" namespace { -const size_t kAnchorSize = 1; +const int32_t kAnchorSize = 1; const int kAnchorNum = 0; } // namespace namespace ge { @@ -47,36 +47,26 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); GE_CHECK_NOTNULL(src_out_anchor); - size_t src_out_anchor_size = src_out_anchor->GetPeerInDataAnchors().size(); - if (src_out_anchor_size <= kAnchorSize) { - GELOGI("Data op only link to hcom op, no need to add memcpy async node."); + int32_t src_out_anchor_size = src_out_anchor->GetPeerInDataAnchors().size(); + if (src_out_anchor_size == kAnchorSize) { + // Memcpyasync needs to be inserted between constant (/data) and hcomallreduce to avoid constant being cleared. + NodePtr src_node = src_out_anchor->GetOwnerNode(); + std::string src_type = src_node->GetType(); + bool check_src_type = (src_type == CONSTANTOP) || (src_type == DATA); + if (check_src_type && node->GetType() == HCOMALLREDUCE) { + Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); + return ret; + } + } continue; } - GELOGI("The op %s need insert memcpy async op.", src_out_anchor->GetOwnerNode()->GetName().c_str()); - NodePtr memcpy_node = CreateMemcpyNode(graph, src_out_anchor); - GE_CHECK_NOTNULL(memcpy_node); - - Status ret1 = src_out_anchor->Unlink(hccl_in_anchor); - if (ret1 != SUCCESS) { - GELOGE(INTERNAL_ERROR, "The op %s Unlink anchor %s fail.", src_out_anchor->GetOwnerNode()->GetName().c_str(), - node->GetName().c_str()); - return FAILED; - } - auto out_data_anchor_0 = memcpy_node->GetOutDataAnchor(kAnchorNum); - GE_CHECK_NOTNULL(out_data_anchor_0); - ret1 = out_data_anchor_0->LinkTo(hccl_in_anchor); - if (ret1 != SUCCESS) { - GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", memcpy_node->GetName().c_str(), - node->GetName().c_str()); - return FAILED; - } - - Status ret = src_out_anchor->LinkTo(memcpy_node->GetInDataAnchor(kAnchorNum)); + Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", src_out_anchor->GetOwnerNode()->GetName().c_str(), - memcpy_node->GetName().c_str()); - return FAILED; + GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); + return ret; } } } @@ -154,4 +144,42 @@ bool HcclMemcpyPass::NeedInsertMemcpyOp(const ge::ConstOpDescPtr &op_desc) const return (op_desc->GetType() == HCOMALLGATHER || op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HCOMREDUCESCATTER); } + +/// +/// @brief Modify edge connection +/// @param [in] ComputeGraphPtr graph +/// @param [in] OutDataAnchorPtr 
src_out_anchor +/// @param [in] InDataAnchorPtr hccl_in_anchor +/// @return status +/// +Status HcclMemcpyPass::ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, + const InDataAnchorPtr &hccl_in_anchor) { + GELOGI("The op %s need insert memcpy async op.", src_out_anchor->GetOwnerNode()->GetName().c_str()); + NodePtr memcpy_node = CreateMemcpyNode(graph, src_out_anchor); + GE_CHECK_NOTNULL(memcpy_node); + + Status ret1 = src_out_anchor->Unlink(hccl_in_anchor); + if (ret1 != SUCCESS) { + GELOGE(INTERNAL_ERROR, "The op %s Unlink anchor %s fail.", src_out_anchor->GetOwnerNode()->GetName().c_str(), + hccl_in_anchor->GetOwnerNode()->GetName().c_str()); + return FAILED; + } + auto out_data_anchor_0 = memcpy_node->GetOutDataAnchor(kAnchorNum); + GE_CHECK_NOTNULL(out_data_anchor_0); + ret1 = out_data_anchor_0->LinkTo(hccl_in_anchor); + if (ret1 != SUCCESS) { + GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", memcpy_node->GetName().c_str(), + hccl_in_anchor->GetOwnerNode()->GetName().c_str()); + return FAILED; + } + + Status ret = src_out_anchor->LinkTo(memcpy_node->GetInDataAnchor(kAnchorNum)); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "The op %s link anchor %s fail.", src_out_anchor->GetOwnerNode()->GetName().c_str(), + memcpy_node->GetName().c_str()); + return FAILED; + } + return SUCCESS; +} + } // namespace ge diff --git a/src/ge/graph/passes/hccl_memcpy_pass.h b/src/ge/graph/passes/hccl_memcpy_pass.h index f4762980..4c4e8ae5 100644 --- a/src/ge/graph/passes/hccl_memcpy_pass.h +++ b/src/ge/graph/passes/hccl_memcpy_pass.h @@ -35,6 +35,9 @@ class HcclMemcpyPass : public GraphPass { bool NeedInsertMemcpyOp(const ge::ConstOpDescPtr &op_desc) const; + Status ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, + const InDataAnchorPtr &hccl_in_anchor); + std::unordered_map node_num_map_; }; } // namespace ge diff --git a/src/ge/graph/passes/infershape_pass.cc b/src/ge/graph/passes/infershape_pass.cc index ae9ec039..18767cea 100644 --- a/src/ge/graph/passes/infershape_pass.cc +++ b/src/ge/graph/passes/infershape_pass.cc @@ -22,7 +22,8 @@ namespace ge { Status InferShapePass::Run(NodePtr &node) { - if (ShapeRefiner::InferShapeAndType(node) != GRAPH_SUCCESS) { + auto ret = ShapeRefiner::InferShapeAndType(node, !OptionExists(kOptimizeAfterSubGraph)); + if (ret != GRAPH_SUCCESS) { GELOGE(GE_GRAPH_INFERSHAPE_FAILED, "infershape failed. 
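// ModifyEdgeConnection above rewires src -> hccl into src -> memcpy -> hccl:
// unlink the original edge, link the new node's output to the consumer, then
// link the producer into the new node. The same unlink/relink pattern in
// miniature, with the anchor API used in the hunk (hypothetical helper, null
// checks omitted):
Status InsertNodeOnEdge(const OutDataAnchorPtr &src, const InDataAnchorPtr &dst, const NodePtr &inserted) {
  if (src->Unlink(dst) != SUCCESS) {
    return FAILED;  // break src -> dst
  }
  if (inserted->GetOutDataAnchor(0)->LinkTo(dst) != SUCCESS) {
    return FAILED;  // inserted -> dst
  }
  // src -> inserted completes the detour.
  return (src->LinkTo(inserted->GetInDataAnchor(0)) == SUCCESS) ? SUCCESS : FAILED;
}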
node: %s", node->GetName().c_str()); return GE_GRAPH_INFERSHAPE_FAILED; } diff --git a/src/ge/graph/passes/link_gen_mask_nodes_pass.cc b/src/ge/graph/passes/link_gen_mask_nodes_pass.cc index 16e8e3d5..ff150a54 100644 --- a/src/ge/graph/passes/link_gen_mask_nodes_pass.cc +++ b/src/ge/graph/passes/link_gen_mask_nodes_pass.cc @@ -17,9 +17,6 @@ #include "graph/passes/link_gen_mask_nodes_pass.h" #include -#include -#include -#include #include "common/ge_inner_error_codes.h" #include "framework/common/debug/ge_log.h" @@ -71,8 +68,8 @@ Status LinkGenMaskNodesPass::Run(ComputeGraphPtr graph) { auto dest_anchor = dest_node->GetInControlAnchor(); GE_CHECK_NOTNULL(dest_anchor); - graphStatus status = src_anchor->LinkTo(dest_anchor); - if (status != GRAPH_SUCCESS) { + graphStatus status_link_to = src_anchor->LinkTo(dest_anchor); + if (status_link_to != GRAPH_SUCCESS) { GELOGE(FAILED, "Link from %s to %s failed.", src_node->GetName().c_str(), dest_node->GetName().c_str()); return FAILED; } @@ -93,7 +90,7 @@ bool LinkGenMaskNodesPass::AreAllInputsConst(const NodePtr &node) const { return true; } -void LinkGenMaskNodesPass::GetAllGenMaskNodes(const ComputeGraphPtr &graph, vector &gen_mask_nodes) const { +void LinkGenMaskNodesPass::GetAllGenMaskNodes(ComputeGraphPtr graph, vector &gen_mask_nodes) const { set nodes_set; for (const NodePtr &node : graph->GetDirectNode()) { if (node->GetType() != DROPOUTDOMASK) { diff --git a/src/ge/graph/passes/link_gen_mask_nodes_pass.h b/src/ge/graph/passes/link_gen_mask_nodes_pass.h index 3d37d3e8..f9979ab1 100644 --- a/src/ge/graph/passes/link_gen_mask_nodes_pass.h +++ b/src/ge/graph/passes/link_gen_mask_nodes_pass.h @@ -17,9 +17,9 @@ #ifndef GE_GRAPH_PASSES_LINK_GEN_MASK_NODES_PASS_H_ #define GE_GRAPH_PASSES_LINK_GEN_MASK_NODES_PASS_H_ -#include #include #include +#include #include "graph/graph.h" #include "inc/graph_pass.h" @@ -28,7 +28,7 @@ namespace ge { // Link all GenMask nodes using control edges. 
class LinkGenMaskNodesPass : public GraphPass { public: - explicit LinkGenMaskNodesPass(const std::map &stream_max_parallel_num); + LinkGenMaskNodesPass(const std::map &stream_max_parallel_num); ~LinkGenMaskNodesPass() override = default; LinkGenMaskNodesPass(const LinkGenMaskNodesPass &) = delete; LinkGenMaskNodesPass &operator=(const LinkGenMaskNodesPass &) = delete; @@ -37,7 +37,7 @@ class LinkGenMaskNodesPass : public GraphPass { private: bool AreAllInputsConst(const NodePtr &node) const; - void GetAllGenMaskNodes(const ComputeGraphPtr &graph, std::vector &gen_mask_nodes) const; + void GetAllGenMaskNodes(ComputeGraphPtr graph, std::vector &gen_mask_nodes) const; Status GetGenMaskGroupSize(std::vector &gen_mask_nodes, size_t &gen_mask_group_size) const; const std::map stream_max_parallel_num_; diff --git a/src/ge/graph/passes/net_output_pass.h b/src/ge/graph/passes/net_output_pass.h index 62287e88..6c86d8ef 100644 --- a/src/ge/graph/passes/net_output_pass.h +++ b/src/ge/graph/passes/net_output_pass.h @@ -18,7 +18,6 @@ #define GE_GRAPH_PASSES_NET_OUTPUT_PASS_H_ #include -#include #include #include #include @@ -177,7 +176,7 @@ class NetOutputPass : public GraphPass { /// Status ProcessWithNetoutput(const ge::ComputeGraphPtr &graph, const ge::NodePtr &output_node); /// - /// check node whether exist in user-set output nodes + /// check node wether exist in user-set output nodes /// @param [in] graph: ComputeGraph /// @param [in] net_out_node: The netOutput node /// @return SUCCESS: Execution succeed diff --git a/src/ge/graph/passes/next_iteration_pass.h b/src/ge/graph/passes/next_iteration_pass.h index 47a86b3a..aefcc0f5 100644 --- a/src/ge/graph/passes/next_iteration_pass.h +++ b/src/ge/graph/passes/next_iteration_pass.h @@ -17,7 +17,6 @@ #ifndef GE_GRAPH_PASSES_NEXT_ITERATION_PASS_H_ #define GE_GRAPH_PASSES_NEXT_ITERATION_PASS_H_ -#include #include #include #include diff --git a/src/ge/graph/passes/no_reshape_op_remove_pass.cc b/src/ge/graph/passes/no_reshape_op_remove_pass.cc deleted file mode 100644 index 59f4eae3..00000000 --- a/src/ge/graph/passes/no_reshape_op_remove_pass.cc +++ /dev/null @@ -1,202 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "graph/passes/no_reshape_op_remove_pass.h" - -#include -#include - -#include "framework/common/debug/ge_log.h" -#include "common/op/attr_value_util.h" -#include "common/op/ge_op_utils.h" -#include "common/types.h" -#include "common/util.h" -#include "framework/common/ge_inner_error_codes.h" -#include "graph/utils/graph_utils.h" -#include "graph/utils/op_desc_utils.h" -#include "graph/utils/tensor_utils.h" -#include "framework/omg/omg_inner_types.h" - -namespace ge { -namespace { -const char *const kReshapeName = "Reshape_3"; -} // namespace -Status NoReshapeOpRemovePass::Run(ge::NodePtr &node) { - if (node == nullptr) { - GELOGE(PARAM_INVALID, "param [node] must not be null"); - return PARAM_INVALID; - } - OpDescPtr op_desc_ptr = node->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc_ptr); - if ((op_desc_ptr->GetType() == EXPANDDIMS) || (op_desc_ptr->GetType() == SQUEEZE)) { - return CheckNodeShapeAndForamt(node); - } else if (op_desc_ptr->GetType() == RESHAPE) { - if (op_desc_ptr->GetName() == kReshapeName) { - NodePtr out_data_node; - NodePtr reshape_out_node; - std::vector types; - std::list path; - path.push_back(node); - types.emplace_back(PERMUTE); - types.emplace_back(TRANSDATA); - types.emplace_back(CORRELATION); - // check reshape out data node fit specific type - bool reshape_correlation_flag = true; - for (size_t i = 0; i < types.size(); i++) { - if (!CheckOutDataNodesType(types[i], path)) { - reshape_correlation_flag = false; - break; - } - } - if (reshape_correlation_flag) { - path.pop_front(); - GE_IF_BOOL_EXEC(!AttrUtils::SetBool(path.front()->GetOpDesc(), "reshape_correlation", reshape_correlation_flag), - GELOGE(INTERNAL_ERROR, "set reshape_correlation failed"); - return INTERNAL_ERROR); - } - path.clear(); - types.clear(); - } - - if (domi::GetContext().format == domi::DOMI_TENSOR_NCHW && !op_desc_ptr->HasAttr(PERMUTE_ATTR_ORDER)) { - std::list path; - path.push_back(node); - string correlation = CORRELATION; - if (CheckOutDataNodesType(correlation, path)) { - op_desc_ptr->SetType(PERMUTE); - if (AttrUtils::SetListInt(op_desc_ptr, PERMUTE_ATTR_ORDER, vector{2, 3, 0, 1})) { - GELOGE(INTERNAL_ERROR, "Set permute attr order failed"); - return INTERNAL_ERROR; - } - path.clear(); - return SUCCESS; - } - } - - // prefer handle linked reshape than single reshape - vector delete_nodes = CheckLinkedReshape(node); - if (delete_nodes.empty()) { - return CheckNodeShapeAndForamt(node); - } - Status ret; - for (NodePtr &delete_node : delete_nodes) { - GE_CHECK_NOTNULL(delete_node); - GELOGI("NoReshapeOpRemovePass remove node:%s", delete_node->GetName().c_str()); - ret = IsolateAndDeleteNode(delete_node, {0}); - if (ret != SUCCESS) { - GELOGE(ret, "NoReshapeOpRemovePass remove node failed,ret:%u", ret); - return ret; - } - } - } - return SUCCESS; -} - -bool NoReshapeOpRemovePass::CheckOutDataNodesType(const string &type, std::list &path) { - if (path.empty()) { - return false; - } - Node::Vistor out_data_nodes = path.back()->GetOutDataNodes(); - bool flag = false; - GE_IF_BOOL_EXEC(out_data_nodes.at(0)->GetOpDesc() == nullptr, GELOGE(FAILED, "out_data_nodes GetOpDesc is nullptr"); - return false); - if ((out_data_nodes.size() == 1) && (out_data_nodes.at(0)->GetOpDesc()->GetType() == type)) { - path.push_back(out_data_nodes.at(0)); - flag = true; - } - return flag; -} - -// if single node input and output shape is same can be delete -Status NoReshapeOpRemovePass::CheckNodeShapeAndForamt(ge::NodePtr &node) { - bool to_be_deleted = false; - OpDescPtr op_desc_ptr = 
node->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc_ptr); - if (op_desc_ptr->GetAllInputsDescPtr().empty()) { - GELOGE(INTERNAL_ERROR, "Input num check fail. node name:%s", op_desc_ptr->GetName().c_str()); - return INTERNAL_ERROR; - } - GE_CHECK_NOTNULL(op_desc_ptr->GetInputDescPtr(0)); - if (op_desc_ptr->GetInputDescPtr(0)->GetFormat() == FORMAT_ND) { - to_be_deleted = true; - } else { - to_be_deleted = true; - // compare input and output dims - std::vector input_4dims; - GE_CHK_STATUS_RET(OpUtils::TransferDim(op_desc_ptr->GetInputDesc(0).GetShape().GetDims(), input_4dims), - "transfer dim failed"); - - std::vector output_4dims; - GE_CHK_STATUS_RET(OpUtils::TransferDim(op_desc_ptr->GetOutputDesc(0).GetShape().GetDims(), output_4dims), - "transfer dim failed"); - - size_t vec_size = (input_4dims.size() > output_4dims.size()) ? output_4dims.size() : input_4dims.size(); - - for (size_t i = 0; i < vec_size; i++) { - if (input_4dims[i] != output_4dims[i]) { - to_be_deleted = false; - break; - } - } - } - if (to_be_deleted) { - GELOGI("NoReshapeOpRemovePass remove node:%s", node->GetName().c_str()); - return IsolateAndDeleteNode(node, {0}); - } - return SUCCESS; -} - -// check Reshape->Reshape linked case if can be delete -vector NoReshapeOpRemovePass::CheckLinkedReshape(ge::NodePtr &node) { - std::list node_path; - std::vector delete_nodes; - GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr, GELOGE(FAILED, "Node OpDesc is nullptr"); return delete_nodes); - const auto &cur_input_desc = node->GetOpDesc()->GetInputDesc(0); - vector cur_input_dims = cur_input_desc.GetShape().GetDims(); - Format cur_input_format = cur_input_desc.GetFormat(); - node_path.push_back(node); - // from front to back find longest sequence reshape can be delete - while (!node_path.empty()) { - const auto src_node = node_path.back(); - if (src_node == nullptr) { - continue; - } - Node::Vistor out_data_nodes = src_node->GetOutDataNodes(); - if ((out_data_nodes.size() == 1) && (out_data_nodes.at(0)->GetOpDesc() != nullptr) && - (out_data_nodes.at(0)->GetOpDesc()->GetType() == RESHAPE)) { - NodePtr dst_node = out_data_nodes.at(0); - node_path.push_back(dst_node); - GeTensorDesc dst_output_desc = dst_node->GetOpDesc()->GetOutputDesc(0); - vector dst_output_dims = dst_output_desc.GetShape().GetDims(); - if ((cur_input_dims.size() == dst_output_dims.size()) && (cur_input_format == dst_output_desc.GetFormat())) { - bool is_reshape_delete = true; - for (size_t i = 0; i < cur_input_dims.size(); i++) { - if (cur_input_dims[i] != dst_output_dims[i]) { - is_reshape_delete = false; - } - } - if (is_reshape_delete) { - delete_nodes.insert(delete_nodes.begin(), node_path.begin(), node_path.end()); - } - } - } else { - break; - } - } - node_path.clear(); - return delete_nodes; -} -} // namespace ge diff --git a/src/ge/graph/passes/no_reshape_op_remove_pass.h b/src/ge/graph/passes/no_reshape_op_remove_pass.h deleted file mode 100644 index 8bbc6d0a..00000000 --- a/src/ge/graph/passes/no_reshape_op_remove_pass.h +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef GE_GRAPH_PASSES_NO_RESHAPE_OP_REMOVE_PASS_H_ -#define GE_GRAPH_PASSES_NO_RESHAPE_OP_REMOVE_PASS_H_ - -#include -#include -#include - -#include "graph/passes/base_pass.h" - -namespace ge { -class NoReshapeOpRemovePass : public BaseNodePass { - public: - /// - /// Entry of the NoReshapeOpRemovePass optimizer - /// @param [in] node: Input Node - /// @return SUCCESS: Dont find need to delete node - /// @return NOT_CHANGED: find need to delete node - /// @return OTHERS: Execution failed - /// @author - /// - Status Run(ge::NodePtr &node) override; - - private: - /// - /// check node input and output dims and format if can be delete - /// @param [in] opDescPtr: To be checked opDesc - /// @return SUCCESS: Check Node Success - /// @return OTHERS: Check Node Failed - /// @author - /// - Status CheckNodeShapeAndForamt(ge::NodePtr &node); - - /// - /// check linked reshape op if can be delete - /// @param [in] node: To be compare Node with opDescPtr - /// @return vector: To be delete nodes - /// @author - /// - vector CheckLinkedReshape(ge::NodePtr &node); - - /// - /// check node input and output dims and format if can be delete - /// @param [in] type: Check type - /// @param [in/out] path: outnode list - /// @return TRUE: To be delete - /// @return FALSE: To be Not delete - /// @author - /// - bool CheckOutDataNodesType(const string &type, std::list &path); -}; -} // namespace ge -#endif // GE_GRAPH_PASSES_NO_RESHAPE_OP_REMOVE_PASS_H_ diff --git a/src/ge/graph/passes/no_use_reshape_remove_pass.cc b/src/ge/graph/passes/no_use_reshape_remove_pass.cc index 3e124264..5ae422ca 100644 --- a/src/ge/graph/passes/no_use_reshape_remove_pass.cc +++ b/src/ge/graph/passes/no_use_reshape_remove_pass.cc @@ -47,8 +47,18 @@ Status NoUseReshapeRemovePass::Run(ge::NodePtr &node) { bool to_be_deleted = true; // compare input and output dims - std::vector input_4dims = op_desc_ptr->GetInputDesc(0).GetShape().GetDims(); - std::vector output_4dims = op_desc_ptr->GetOutputDesc(0).GetShape().GetDims(); + if (op_desc_ptr->GetAllInputsDesc().empty() || op_desc_ptr->GetAllOutputsDesc().empty()) { + GELOGE(INTERNAL_ERROR, "Input or output num is zero. 
node name:%s, input size:%zu, output size:%zu", + op_desc_ptr->GetName().c_str(), op_desc_ptr->GetAllInputsDesc().size(), + op_desc_ptr->GetAllOutputsDesc().size()); + return INTERNAL_ERROR; + } + const auto &input_desc = op_desc_ptr->MutableInputDesc(0); + const auto &output_desc = op_desc_ptr->MutableOutputDesc(0); + GE_CHECK_NOTNULL(input_desc); + GE_CHECK_NOTNULL(output_desc); + std::vector input_4dims = input_desc->GetShape().GetDims(); + std::vector output_4dims = output_desc->GetShape().GetDims(); if (input_4dims.size() != output_4dims.size()) { GELOGI("Input and output dim size is not equal.Keep this reshape op."); diff --git a/src/ge/graph/passes/pass_manager.cc b/src/ge/graph/passes/pass_manager.cc index 6cfcfe6b..f62ea160 100644 --- a/src/ge/graph/passes/pass_manager.cc +++ b/src/ge/graph/passes/pass_manager.cc @@ -46,9 +46,19 @@ Status PassManager::Run(const ComputeGraphPtr &graph, vector &passe if (status == SUCCESS) { not_changed = false; } else if (status != NOT_CHANGED) { - GELOGE(status, "Pass Run failed"); + GELOGE(status, "Pass Run failed on graph %s", graph->GetName().c_str()); return status; } + for (const auto &subgraph : graph->GetAllSubgraphs()) { + GE_CHECK_NOTNULL(subgraph); + status = pass->Run(subgraph); + if (status == SUCCESS) { + not_changed = false; + } else if (status != NOT_CHANGED) { + GELOGE(status, "Pass Run failed on subgraph %s", subgraph->GetName().c_str()); + return status; + } + } } return not_changed ? NOT_CHANGED : SUCCESS; diff --git a/src/ge/graph/passes/pass_utils.h b/src/ge/graph/passes/pass_utils.h index b80e05f1..a8b1cfe3 100644 --- a/src/ge/graph/passes/pass_utils.h +++ b/src/ge/graph/passes/pass_utils.h @@ -18,7 +18,6 @@ #define GE_GRAPH_PASSES_PASS_UTILS_H_ #include - #include "framework/common/debug/ge_log.h" #include "common/ge_inner_error_codes.h" #include "graph/compute_graph.h" @@ -37,7 +36,7 @@ class PassUtils { static Status RemoveBranch(const NodePtr &node, std::vector &delete_nodes, std::vector &end_nodes); static Status RemoveInactiveBranchToMerge(const OutDataAnchorPtr &inactive_output_anchor, - std::vector &delete_nodes, std::vector &end_nodes); + std::vector &delete_nodes, std::vector &end_nodes); /// /// check is need iter flow ctrl. diff --git a/src/ge/graph/passes/permute_pass.cc b/src/ge/graph/passes/permute_pass.cc index d541e66a..c2ce5465 100644 --- a/src/ge/graph/passes/permute_pass.cc +++ b/src/ge/graph/passes/permute_pass.cc @@ -15,10 +15,8 @@ */ #include "graph/passes/permute_pass.h" - #include #include - #include "common/debug/log.h" #include "common/types.h" #include "graph/utils/attr_utils.h" @@ -34,100 +32,91 @@ using domi::SUCCESS; namespace ge { Status PermutePass::Run(ComputeGraphPtr graph) { + GE_TIMESTAMP_START(PermutePass); GE_CHECK_NOTNULL(graph); std::vector isolate_nodes; for (NodePtr &node : graph->GetAllNodes()) { OpDescPtr op_desc_ptr = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc_ptr); GE_IF_BOOL_EXEC( - op_desc_ptr->GetType() == PERMUTE && GetContext().type == domi::FMK_TYPE_T, - /// Input format 5D means NHWC in 4D way. So if input origin foramt is NCHW and - /// permute paramter list is [0,3,1,2], this permute can be optimised. - GE_IF_BOOL_EXEC( - GetContext().format != DOMI_TENSOR_ND, - // Get input origin foramt - for (NodePtr &n : graph->GetAllNodes()) { + op_desc_ptr->GetType() == PERMUTE && GetContext().type == domi::FMK_TYPE_T, + /// Input format 5D means NHWC in 4D way. So if input origin foramt is NCHW and + /// permute paramter list is [0,3,1,2], this permute can be optimised. 
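// The pass_manager.cc hunk above extends each pass run from the root graph to
// all of its subgraphs, treating "changed anywhere" as changed. Its control
// flow, reduced to a skeleton (assuming the GraphPass interface used in the
// hunk):
Status RunOnGraphAndSubgraphs(GraphPass &pass, const ComputeGraphPtr &graph, bool &changed) {
  Status status = pass.Run(graph);
  if (status == SUCCESS) {
    changed = true;
  } else if (status != NOT_CHANGED) {
    return status;  // a real failure aborts the whole pass run
  }
  for (const auto &subgraph : graph->GetAllSubgraphs()) {
    GE_CHECK_NOTNULL(subgraph);
    status = pass.Run(subgraph);
    if (status == SUCCESS) {
      changed = true;
    } else if (status != NOT_CHANGED) {
      return status;
    }
  }
  return SUCCESS;
}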
GE_IF_BOOL_EXEC( - n->GetOpDesc()->GetType() == PERMUTE, std::queue < NodePtr > q_node; - q_node.push(n); - bool jump_out = false; - while (!q_node.empty()) { - NodePtr n_temp = q_node.back(); - q_node.pop(); - for (auto &inNode : n_temp->GetInDataNodes()) { - int64_t cur_format = 0; - GE_IF_BOOL_EXEC(AttrUtils::GetInt(inNode->GetOpDesc(), ATTR_NAME_FORMAT, cur_format), - GE_IF_BOOL_EXEC(!AttrUtils::SetInt(n->GetOpDesc(), "permute_src_format", cur_format), - GELOGW("set permute_src_format failed"); - continue); - jump_out = true; - break); - q_node.push(inNode); + GetContext().format != DOMI_TENSOR_ND, + // Get input origin foramt + for (NodePtr &n + : graph->GetAllNodes()) { + GE_IF_BOOL_EXEC( + n->GetOpDesc()->GetType() == PERMUTE, std::queue q_node; q_node.push(n); bool jump_out = false; + while (!q_node.empty()) { + NodePtr n_temp = q_node.back(); + q_node.pop(); + for (auto &inNode : n_temp->GetInDataNodes()) { + int64_t cur_format = 0; + GE_IF_BOOL_EXEC(AttrUtils::GetInt(inNode->GetOpDesc(), ATTR_NAME_FORMAT, cur_format), + GE_IF_BOOL_EXEC(!AttrUtils::SetInt(n->GetOpDesc(), "permute_src_format", cur_format), + GELOGW("set permute_src_format failed"); + continue); + jump_out = true; break); + q_node.push(inNode); + } + GE_IF_BOOL_EXEC(jump_out, break); + }); } - GE_IF_BOOL_EXEC(jump_out, - break); - }); - } - int64_t permute_src_format = 0; - GE_IF_BOOL_EXEC(!AttrUtils::GetInt(op_desc_ptr, "permute_src_format", permute_src_format), - continue); - // Get dim_index_ - std::vector index_list; - GE_CHK_BOOL_RET_STATUS(AttrUtils::GetListInt(op_desc_ptr, PERMUTE_ATTR_ORDER, index_list), INTERNAL_ERROR, - "get index list failed"); + int64_t permute_src_format = 0; + GE_IF_BOOL_EXEC(!AttrUtils::GetInt(op_desc_ptr, "permute_src_format", permute_src_format), continue); + // Get dim_index_ + std::vector index_list; GE_CHK_BOOL_RET_STATUS( + AttrUtils::GetListInt(op_desc_ptr, PERMUTE_ATTR_ORDER, index_list), INTERNAL_ERROR, "get index list failed"); - size_t index_size = index_list.size(); - GE_IF_BOOL_EXEC(index_size == 0, - continue); + size_t index_size = index_list.size(); GE_IF_BOOL_EXEC(index_size == 0, continue); - GE_IF_BOOL_EXEC( - index_size == 4 && (permute_src_format == DOMI_TENSOR_NHWC && index_list.at(0) == 0 && - index_list.at(1) == 3 && index_list.at(2) == 1 && index_list.at(3) == 2), - isolate_nodes.push_back(node); - continue); - int64_t conv_format = 0; - GE_IF_BOOL_EXEC( - index_size == 4 && + GE_IF_BOOL_EXEC(index_size == 4 && (permute_src_format == DOMI_TENSOR_NHWC && index_list.at(0) == 0 && + index_list.at(1) == 3 && index_list.at(2) == 1 && index_list.at(3) == 2), + isolate_nodes.push_back(node); + continue); + int64_t conv_format = 0; GE_IF_BOOL_EXEC( + index_size == 4 && (index_list.at(0) == 0 && index_list.at(1) == 2 && index_list.at(2) == 3 && index_list.at(3) == 1), - GE_IF_BOOL_EXEC( + GE_IF_BOOL_EXEC( (node->GetOutDataNodesSize() > 0 && node->GetOutDataNodes().at(0) != nullptr && - node->GetOutDataNodes().at(0)->GetOpDesc() != nullptr) && - ((node->GetOutDataNodesSize() != 0 && - CONVOLUTION == node->GetOutDataNodes().at(0)->GetOpDesc()->GetType() && - AttrUtils::GetInt(node->GetOutDataNodes().at(0)->GetOpDesc(), ATTR_NAME_FORMAT, - conv_format) && - conv_format == DOMI_TENSOR_NHWC) || - (node->GetOutDataNodesSize() != 0 && - node->GetOutDataNodes().at(0)->GetOpDesc()->GetType() == DEPCONVOLUTION) || - (node->GetOutDataNodesSize() != 0 && - node->GetOutDataNodes().at(0)->GetOpDesc()->GetType() == DECONVOLUTION) || - (node->GetOutDataNodesSize() != 0 && - 
node->GetOutDataNodes().at(0)->GetOpDesc()->GetType() == PAD && - node->GetOutDataNodes().at(0)->GetOutDataNodesSize() != 0 && - node->GetOutDataNodes().at(0)->GetOutDataNodes().at(0) != nullptr && - node->GetOutDataNodes().at(0)->GetOutDataNodes().at(0)->GetOpDesc() != nullptr && - node->GetOutDataNodes().at(0)->GetOutDataNodes().at(0)->GetOpDesc()->GetType() == - CONVOLUTION)), + node->GetOutDataNodes().at(0)->GetOpDesc() != nullptr) && + ((node->GetOutDataNodesSize() != 0 && + CONVOLUTION == node->GetOutDataNodes().at(0)->GetOpDesc()->GetType() && + AttrUtils::GetInt(node->GetOutDataNodes().at(0)->GetOpDesc(), ATTR_NAME_FORMAT, conv_format) && + conv_format == DOMI_TENSOR_NHWC) || + (node->GetOutDataNodesSize() != 0 && + node->GetOutDataNodes().at(0)->GetOpDesc()->GetType() == DEPCONVOLUTION) || + (node->GetOutDataNodesSize() != 0 && + node->GetOutDataNodes().at(0)->GetOpDesc()->GetType() == DECONVOLUTION) || + (node->GetOutDataNodesSize() != 0 && node->GetOutDataNodes().at(0)->GetOpDesc()->GetType() == PAD && + node->GetOutDataNodes().at(0)->GetOutDataNodesSize() != 0 && + node->GetOutDataNodes().at(0)->GetOutDataNodes().at(0) != nullptr && + node->GetOutDataNodes().at(0)->GetOutDataNodes().at(0)->GetOpDesc() != nullptr && + node->GetOutDataNodes().at(0)->GetOutDataNodes().at(0)->GetOpDesc()->GetType() == CONVOLUTION)), isolate_nodes.push_back(node); - continue);););); + continue);););); } - GE_IF_BOOL_EXEC(isolate_nodes.size() != 0, - for (auto &node : isolate_nodes) { - // Adding an attribute indicates that the predecessor Permute has been deleted for the Builder to process. - for (auto &outNode : node->GetOutDataNodes()) { - OpDescPtr op_desc_ptr = outNode->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc_ptr); - if (!AttrUtils::SetBool(op_desc_ptr, ATTR_NAME_PRED_PERMUTE_DELETED, true)) { - GELOGE(INTERNAL_ERROR, "set ATTR_NAME_PRED_PERMUTE_DELETED failed"); - return INTERNAL_ERROR; + GE_IF_BOOL_EXEC( + isolate_nodes.size() != 0, for (auto &node + : isolate_nodes) { + // Adding an attribute indicates that the predecessor Permute has been deleted for the Builder to process. 
+ for (auto &outNode : node->GetOutDataNodes()) { + OpDescPtr op_desc_ptr = outNode->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc_ptr); + if (!AttrUtils::SetBool(op_desc_ptr, ATTR_NAME_PRED_PERMUTE_DELETED, true)) { + GELOGE(INTERNAL_ERROR, "set ATTR_NAME_PRED_PERMUTE_DELETED failed"); + return INTERNAL_ERROR; + } } - } - GE_RETURN_WITH_LOG_IF_ERROR(graph->RemoveNode(node), "[%s]:remove permute node failed", - node->GetOpDesc()->GetName().c_str()); - }); + GE_RETURN_WITH_LOG_IF_ERROR(graph->RemoveNode(node), "[%s]:remove permute node failed", + node->GetOpDesc()->GetName().c_str()); + }); + GE_TIMESTAMP_END(PermutePass, "GraphManager::PermutePass"); return SUCCESS; } } // namespace ge diff --git a/src/ge/graph/passes/placeholder_with_default_pass.cc b/src/ge/graph/passes/placeholder_with_default_pass.cc index 4a8ded9c..7a72fc36 100644 --- a/src/ge/graph/passes/placeholder_with_default_pass.cc +++ b/src/ge/graph/passes/placeholder_with_default_pass.cc @@ -15,9 +15,7 @@ */ #include "graph/passes/placeholder_with_default_pass.h" - #include - #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include "graph/common/omg_util.h" diff --git a/src/ge/graph/passes/prevent_gradient_pass.cc b/src/ge/graph/passes/prevent_gradient_pass.cc index ff4f3cc7..87c1b3a1 100644 --- a/src/ge/graph/passes/prevent_gradient_pass.cc +++ b/src/ge/graph/passes/prevent_gradient_pass.cc @@ -17,7 +17,6 @@ #include "graph/passes/prevent_gradient_pass.h" #include - #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include "graph/common/omg_util.h" diff --git a/src/ge/graph/passes/print_op_pass.cc b/src/ge/graph/passes/print_op_pass.cc index c0eedc1f..fba7b712 100644 --- a/src/ge/graph/passes/print_op_pass.cc +++ b/src/ge/graph/passes/print_op_pass.cc @@ -15,7 +15,6 @@ */ #include "graph/passes/print_op_pass.h" - #include namespace ge { diff --git a/src/ge/graph/passes/prune_pass.cc b/src/ge/graph/passes/prune_pass.cc index b57c52ec..f7d09740 100644 --- a/src/ge/graph/passes/prune_pass.cc +++ b/src/ge/graph/passes/prune_pass.cc @@ -15,11 +15,10 @@ */ #include "graph/passes/prune_pass.h" - #include +#include #include #include - #include "common/debug/log.h" #include "common/types.h" #include "framework/common/debug/ge_log.h" diff --git a/src/ge/graph/passes/replace_with_empty_const_pass.cc b/src/ge/graph/passes/replace_with_empty_const_pass.cc new file mode 100644 index 00000000..b76b2cc9 --- /dev/null +++ b/src/ge/graph/passes/replace_with_empty_const_pass.cc @@ -0,0 +1,156 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
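The removal condition PermutePass checks is plain index arithmetic: with the order semantics out[i] = in[order[i]] implied by the pass's index checks, the order [0,3,1,2] applied to an NHWC shape is exactly the NHWC-to-NCHW relabeling, which is why such a Permute can be isolated and deleted when the surrounding layout already carries the data NCHW-style. A small self-contained illustration (not GE code; Permute is a hypothetical helper):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Apply a permutation order to a dims vector: out[i] = in[order[i]].
std::vector<int64_t> Permute(const std::vector<int64_t> &dims, const std::vector<size_t> &order) {
  std::vector<int64_t> out;
  out.reserve(order.size());
  for (size_t axis : order) out.push_back(dims.at(axis));
  return out;
}

int main() {
  const std::vector<int64_t> nhwc = {8, 224, 224, 3};  // N, H, W, C
  // The order the pass tests for: [0, 3, 1, 2].
  const std::vector<int64_t> result = Permute(nhwc, {0, 3, 1, 2});
  assert((result == std::vector<int64_t>{8, 3, 224, 224}));  // N, C, H, W
  return 0;
}
```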
+ */ + +#include "graph/passes/replace_with_empty_const_pass.h" +#include +#include +#include "common/ge/ge_util.h" +#include "framework/common/debug/ge_log.h" +#include "framework/common/ge_inner_error_codes.h" +#include "graph/utils/graph_utils.h" + +namespace ge { +Status ReplaceWithEmptyConstPass::Run(NodePtr &node) { + GELOGD("ReplaceWithEmptyConstPass in."); + if (node == nullptr) { + GELOGE(PARAM_INVALID, "Parameter is null."); + return PARAM_INVALID; + } + if (node->GetOpDesc() == nullptr) { + GELOGE(PARAM_INVALID, "Param [opDesc] must not be null."); + return PARAM_INVALID; + } + // Nodes like NoOp have no outputs + if (node->GetOpDesc()->GetAllOutputsDescPtr().empty()) { + GELOGI("Node %s has no output desc. Ignore current pass.", node->GetName().c_str()); + return SUCCESS; + } + // If outputs of current node are all empty, replace it with empty const + bool is_all_output_empty = true; + for (const auto &output_desc_ptr : node->GetOpDesc()->GetAllOutputsDescPtr()) { + if (output_desc_ptr == nullptr) { + GELOGI("Node %s got empty output_desc_ptr, ignore current pass.", node->GetName().c_str()); + return SUCCESS; + } + if (!IsEmptyTenor(output_desc_ptr->GetShape())) { + is_all_output_empty = false; + break; + } + } + if (is_all_output_empty) { + GELOGI("Node %s has empty tensor output. It will be replaced by empty const.", node->GetName().c_str()); + // Replace an op whose outputs are all empty with an empty const + Status ret = ReplaceWithEmptyConst(node); + if (ret != SUCCESS) { + // If replace failed, it should not break whole process, so still return success + GELOGW("Failed to replace node %s with empty const.", node->GetName().c_str()); + } + } + GELOGD("ReplaceWithEmptyConstPass end."); + return SUCCESS; +} + +Status ReplaceWithEmptyConstPass::ReplaceWithEmptyConst(NodePtr &node_to_replace) { + std::map> shape_out_idx_map; + auto op_desc = node_to_replace->GetOpDesc(); + // Collect out_idx grouped by output shape + for (const auto &out_anchor : node_to_replace->GetAllOutDataAnchors()) { + auto out_desc = op_desc->GetOutputDesc(out_anchor->GetIdx()); + shape_out_idx_map[GetDimStr(out_desc.GetShape())].emplace_back(out_anchor->GetIdx()); + } + + for (const auto &shape_2_out_idx : shape_out_idx_map) { + // Create empty const + // The out_descs in one group share the same shape, so taking the first out_desc (a valid index) is enough.
+ auto out_desc = op_desc->GetOutputDesc(shape_2_out_idx.second[0]); + NodePtr const_node; + auto graph = node_to_replace->GetOwnerComputeGraph(); + Status ret = InsertEmptyConst(out_desc, const_node, graph); + if (ret != SUCCESS) { + GELOGE(FAILED, "Failed to insert const node."); + return FAILED; + } + + // Replace data anchors + if (GraphUtils::ReplaceNodeDataAnchors(const_node, node_to_replace, {}, shape_2_out_idx.second) != GRAPH_SUCCESS) { + GELOGE(FAILED, "[%s] ReplaceNodeAnchors failed.", node_to_replace->GetName().c_str()); + return FAILED; + } + // Copy in control edge + if (GraphUtils::CopyInCtrlEdges(node_to_replace, const_node) != GRAPH_SUCCESS) { + GELOGE(FAILED, "CopyInCtrlEdges from %s to %s failed.", node_to_replace->GetName().c_str(), + const_node->GetName().c_str()); + return FAILED; + } + // Copy out control edge + if (GraphUtils::CopyOutCtrlEdges(node_to_replace, const_node) != GRAPH_SUCCESS) { + GELOGE(FAILED, "CopyOutCtrlEdges from %s to %s failed.", node_to_replace->GetName().c_str(), + const_node->GetName().c_str()); + return FAILED; + } + GELOGI("Node %s has been replaced by empty const %s.", node_to_replace->GetName().c_str(), + const_node->GetName().c_str()); + } + // Unlink control edge from node_to_replace to graph + if (node_to_replace->GetInControlAnchor() != nullptr) { + node_to_replace->GetInControlAnchor()->UnlinkAll(); + } + if (node_to_replace->GetOutControlAnchor() != nullptr) { + node_to_replace->GetOutControlAnchor()->UnlinkAll(); + } + return SUCCESS; +} +Status ReplaceWithEmptyConstPass::InsertEmptyConst(const GeTensorDesc &out_desc, NodePtr &const_node, + ComputeGraphPtr &graph) { + GeTensorPtr empty_tensor = MakeShared(); + if (empty_tensor == nullptr) { + GELOGE(OUT_OF_MEMORY, "Failed to create empty tensor."); + return OUT_OF_MEMORY; + } + empty_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType()); + empty_tensor->MutableTensorDesc().SetFormat(out_desc.GetFormat()); + empty_tensor->MutableTensorDesc().SetShape(out_desc.GetShape()); + auto const_desc = OpDescUtils::CreateConstOp(empty_tensor); + if (const_desc == nullptr) { + GELOGE(OUT_OF_MEMORY, "Failed to get const desc from tensor"); + return OUT_OF_MEMORY; + } + + const_node = graph->AddNode(const_desc); + if (const_node == nullptr) { + GELOGE(FAILED, "Failed to insert const node."); + return FAILED; + } + return SUCCESS; +} + +bool ReplaceWithEmptyConstPass::IsEmptyTenor(const GeShape &shape) const { + for (auto dim : shape.GetDims()) { + if (dim == 0) { + return true; + } + } + return false; +} + +string ReplaceWithEmptyConstPass::GetDimStr(const GeShape &shape) { + std::stringstream dim_str; + for (auto dim : shape.GetDims()) { + dim_str << dim << '-'; + } + return dim_str.str(); +} +} // namespace ge diff --git a/src/ge/graph/passes/replace_with_empty_const_pass.h b/src/ge/graph/passes/replace_with_empty_const_pass.h new file mode 100644 index 00000000..495b75b3 --- /dev/null +++ b/src/ge/graph/passes/replace_with_empty_const_pass.h @@ -0,0 +1,34 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
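The new pass above keys its replacement consts by output shape: a tensor counts as empty when any dimension is 0, and outputs with identical dims share one const. A reduced sketch of just that detection and grouping, with plain vectors standing in for GeShape (IsEmptyShape, DimKey and GroupOutputsByShape are hypothetical helper names):

```cpp
#include <cstdint>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// A shape is empty if any dimension is 0, the same rule the pass applies.
bool IsEmptyShape(const std::vector<int64_t> &dims) {
  for (int64_t dim : dims) {
    if (dim == 0) return true;
  }
  return false;
}

// Build the grouping key the pass uses: dims joined with '-'.
std::string DimKey(const std::vector<int64_t> &dims) {
  std::stringstream key;
  for (int64_t dim : dims) key << dim << '-';
  return key.str();
}

// Group output indices by shape so one empty const can serve each group.
std::map<std::string, std::vector<int>> GroupOutputsByShape(
    const std::vector<std::vector<int64_t>> &output_shapes) {
  std::map<std::string, std::vector<int>> shape_to_out_idx;
  for (int idx = 0; idx < static_cast<int>(output_shapes.size()); ++idx) {
    shape_to_out_idx[DimKey(output_shapes[idx])].push_back(idx);
  }
  return shape_to_out_idx;
}
```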
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_GRAPH_PASSES_REPLACE_WITH_EMPTY_CONST_PASS_H_ +#define GE_GRAPH_PASSES_REPLACE_WITH_EMPTY_CONST_PASS_H_ + +#include "graph/passes/base_pass.h" + +namespace ge { +class ReplaceWithEmptyConstPass : public BaseNodePass { + public: + Status Run(NodePtr &node) override; + + private: + Status ReplaceWithEmptyConst(NodePtr &node_to_replace); + Status InsertEmptyConst(const GeTensorDesc &out_desc, NodePtr &const_node, ComputeGraphPtr &graph); + bool IsEmptyTenor(const GeShape &shape) const; + std::string GetDimStr(const GeShape &shape); +}; +} // namespace ge +#endif // GE_GRAPH_PASSES_REPLACE_WITH_EMPTY_CONST_PASS_H_ diff --git a/src/ge/graph/passes/reshape_remove_pass.cc b/src/ge/graph/passes/reshape_remove_pass.cc index 0491270d..13865648 100644 --- a/src/ge/graph/passes/reshape_remove_pass.cc +++ b/src/ge/graph/passes/reshape_remove_pass.cc @@ -15,7 +15,6 @@ */ #include "graph/passes/reshape_remove_pass.h" - #include "graph/passes/pass_utils.h" namespace ge { diff --git a/src/ge/graph/passes/resource_pair_add_control_pass.cc b/src/ge/graph/passes/resource_pair_add_control_pass.cc index 7c896867..c5be9600 100644 --- a/src/ge/graph/passes/resource_pair_add_control_pass.cc +++ b/src/ge/graph/passes/resource_pair_add_control_pass.cc @@ -20,7 +20,6 @@ #include #include #include - #include "framework/common/debug/ge_log.h" #include "common/ge_inner_error_codes.h" #include "common/types.h" diff --git a/src/ge/graph/passes/resource_pair_remove_control_pass.cc b/src/ge/graph/passes/resource_pair_remove_control_pass.cc index 2bcb7db1..de3537f0 100644 --- a/src/ge/graph/passes/resource_pair_remove_control_pass.cc +++ b/src/ge/graph/passes/resource_pair_remove_control_pass.cc @@ -20,7 +20,6 @@ #include #include #include - #include "framework/common/debug/ge_log.h" #include "common/ge_inner_error_codes.h" #include "common/types.h" diff --git a/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc b/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc index 59e32e10..638bfb06 100644 --- a/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc +++ b/src/ge/graph/passes/same_transdata_breadth_fusion_pass.cc @@ -15,13 +15,11 @@ */ #include "graph/passes/same_transdata_breadth_fusion_pass.h" - #include #include #include #include #include - #include "common/ge_inner_error_codes.h" #include "common/types.h" #include "framework/common/debug/ge_log.h" @@ -126,13 +124,13 @@ void SameTransdataBreadthFusionPass::GetSameTransdataNode(vector &same_tran auto node_for_compare = node_for_compare_in_anchor->GetOwnerNode(); auto op_desc_for_compare = node_for_compare->GetOpDesc(); GE_CHECK_NOTNULL_JUST_RETURN(op_desc_for_compare); - bool op_compare_label = op_desc_for_compare->HasAttr(ATTR_NAME_STREAM_LABEL); + string op_compare_stream_label; + (void)AttrUtils::GetStr(op_desc_for_compare, ATTR_NAME_STREAM_LABEL, op_compare_stream_label); auto input_desc_for_compare = op_desc_for_compare->GetInputDescPtr(node_for_compare_in_anchor->GetIdx()); GE_CHECK_NOTNULL_JUST_RETURN(input_desc_for_compare); auto output_desc_for_compare = op_desc_for_compare->GetOutputDescPtr(0); GE_CHECK_NOTNULL_JUST_RETURN(output_desc_for_compare); iter = all_transdata_nodes_.erase(iter); - bool op_tmp_label = false; while (iter != all_transdata_nodes_.end()) { auto in_anchor = iter->second; if (in_anchor == nullptr) { @@ -148,11 +146,13 @@ void SameTransdataBreadthFusionPass::GetSameTransdataNode(vector &same_tran 
GE_CHECK_NOTNULL_JUST_RETURN(op_desc_tmp); auto input_desc_tmp = op_desc_tmp->GetInputDescPtr(in_anchor->GetIdx()); auto output_desc_tmp = op_desc_tmp->GetOutputDescPtr(0); - op_tmp_label = op_desc_tmp->HasAttr(ATTR_NAME_STREAM_LABEL); + string op_tmp_stream_label; + (void)AttrUtils::GetStr(op_desc_tmp, ATTR_NAME_STREAM_LABEL, op_tmp_stream_label); GE_CHECK_NOTNULL_JUST_RETURN(input_desc_tmp); GE_CHECK_NOTNULL_JUST_RETURN(output_desc_tmp); - if ((op_compare_label == op_tmp_label) && (input_desc_tmp->GetFormat() == input_desc_for_compare->GetFormat()) && + if ((op_compare_stream_label == op_tmp_stream_label) && + (input_desc_tmp->GetFormat() == input_desc_for_compare->GetFormat()) && (output_desc_tmp->GetFormat() == output_desc_for_compare->GetFormat())) { GELOGD("same transdata node:%s, src node:%s", node_tmp->GetName().c_str(), node_for_compare->GetName().c_str()); InsertSameTransdataNodeIndex(iter->first, same_transdata_nodes); diff --git a/src/ge/graph/passes/same_transdata_breadth_fusion_pass.h b/src/ge/graph/passes/same_transdata_breadth_fusion_pass.h index 5d8d09b4..f4b44a59 100644 --- a/src/ge/graph/passes/same_transdata_breadth_fusion_pass.h +++ b/src/ge/graph/passes/same_transdata_breadth_fusion_pass.h @@ -19,7 +19,6 @@ #include #include - #include "inc/graph_pass.h" namespace ge { diff --git a/src/ge/graph/passes/save_pass.cc b/src/ge/graph/passes/save_pass.cc index 92e2af8d..49196206 100644 --- a/src/ge/graph/passes/save_pass.cc +++ b/src/ge/graph/passes/save_pass.cc @@ -19,7 +19,6 @@ #include #include #include - #include "framework/common/debug/ge_log.h" #include "common/ge_inner_error_codes.h" #include "graph/utils/graph_utils.h" @@ -48,7 +47,8 @@ Status SavePass::Run(ge::ComputeGraphPtr graph) { out_index.emplace_back(out_anchor->GetIdx()); ge::OpDescPtr op_desc = peer_node->GetOpDesc(); GE_IF_BOOL_EXEC(!ge::AttrUtils::SetStr(op_desc, kVarAttrVarIsSave, kVarIsSave), - GELOGE(INTERNAL_ERROR, "get kVarAttrVarIsSave failed"); return INTERNAL_ERROR); + GELOGE(INTERNAL_ERROR, "get kVarAttrVarIsSave failed"); + return INTERNAL_ERROR); } } } diff --git a/src/ge/graph/passes/snapshot_pass.cc b/src/ge/graph/passes/snapshot_pass.cc index 2b8577b9..702cf4de 100644 --- a/src/ge/graph/passes/snapshot_pass.cc +++ b/src/ge/graph/passes/snapshot_pass.cc @@ -15,9 +15,7 @@ */ #include "graph/passes/snapshot_pass.h" - #include - #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include "graph/common/omg_util.h" @@ -31,7 +29,7 @@ Status SnapshotPass::Run(NodePtr &node) { string type; Status status_ret = GetOriginalType(node, type); if (status_ret != SUCCESS) { - GELOGE(status_ret, "SnapshotPass get original type fail."); + GELOGE(status_ret, "SnapshotPass get original type failed."); return status_ret; } if (type == SNAPSHOT) { diff --git a/src/ge/graph/passes/stop_gradient_pass.cc b/src/ge/graph/passes/stop_gradient_pass.cc index 680fbbba..bd5c0ea8 100644 --- a/src/ge/graph/passes/stop_gradient_pass.cc +++ b/src/ge/graph/passes/stop_gradient_pass.cc @@ -15,7 +15,6 @@ */ #include "graph/passes/stop_gradient_pass.h" - #include namespace ge { @@ -27,7 +26,7 @@ Status StopGradientPass::Run(NodePtr &node) { string type; Status status_ret = GetOriginalType(node, type); if (status_ret != SUCCESS) { - GELOGE(status_ret, "StopGradientPass get original type fail."); + GELOGE(status_ret, "StopGradientPass get original type failed."); return status_ret; } diff --git a/src/ge/graph/passes/switch_logic_remove_pass.cc 
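The same_transdata change above is subtle: the old code compared only whether both TransData ops had a stream label (HasAttr), so two ops pinned to different streams still looked fusable; comparing the label values closes that hole. A reduced illustration of the difference, with a plain map standing in for the attribute store and "_stream_label" an assumed value for ATTR_NAME_STREAM_LABEL:

```cpp
#include <cassert>
#include <map>
#include <string>

using AttrMap = std::map<std::string, std::string>;

// Old check: the nodes merely agree on whether the attribute exists.
bool SameByPresence(const AttrMap &a, const AttrMap &b) {
  return a.count("_stream_label") == b.count("_stream_label");
}

// New check: the attribute values must match (a missing label reads as empty).
bool SameByValue(const AttrMap &a, const AttrMap &b) {
  auto get = [](const AttrMap &m) {
    auto it = m.find("_stream_label");
    return it == m.end() ? std::string() : it->second;
  };
  return get(a) == get(b);
}

int main() {
  AttrMap op1{{"_stream_label", "stream_0"}};
  AttrMap op2{{"_stream_label", "stream_1"}};
  assert(SameByPresence(op1, op2));  // false positive: both are labeled
  assert(!SameByValue(op1, op2));    // correctly rejected: different streams
  return 0;
}
```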
b/src/ge/graph/passes/switch_logic_remove_pass.cc index f3d72aad..be84a582 100644 --- a/src/ge/graph/passes/switch_logic_remove_pass.cc +++ b/src/ge/graph/passes/switch_logic_remove_pass.cc @@ -15,11 +15,9 @@ */ #include "graph/passes/switch_logic_remove_pass.h" - #include #include #include - #include "framework/common/debug/ge_log.h" #include "graph/utils/graph_utils.h" #include "graph/passes/pass_utils.h" @@ -39,9 +37,7 @@ char const *GetOutputNameFromIndex(int index) { return "UNKNOWN"; } -inline bool IsSwitch(const std::string &type) { - return type == SWITCH || type == REFSWITCH; -} +inline bool IsSwitch(const std::string &type) { return type == SWITCH || type == REFSWITCH; } Status GetPredNode(const NodePtr &switch_node, PredNodeAndOut &pred_node_index) { GE_CHECK_NOTNULL(switch_node); @@ -52,16 +48,13 @@ Status GetPredNode(const NodePtr &switch_node, PredNodeAndOut &pred_node_index) } auto pred_node_anchor = pred_in_anchor->GetPeerOutAnchor(); if (pred_node_anchor == nullptr) { - GELOGE(INTERNAL_ERROR, - "Failed to get pred node for switch %s, node peer out anchor", + GELOGE(INTERNAL_ERROR, "Failed to get pred node for switch %s, node peer out anchor", switch_node->GetName().c_str()); return INTERNAL_ERROR; } auto pred_node = pred_node_anchor->GetOwnerNode(); if (pred_node == nullptr) { - GELOGE(INTERNAL_ERROR, - "Failed to get pred node for switch %s, null node", - switch_node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "Failed to get pred node for switch %s, null node", switch_node->GetName().c_str()); return INTERNAL_ERROR; } pred_node_index.first = pred_node; @@ -113,8 +106,8 @@ Status SwitchLogicRemovePass::Run(NodePtr &node) { continue; } GELOGI("The switch nodes cascaded %s and %s have the save pred node %s, the %s can be remove", - node->GetName().c_str(), dst_node->GetName().c_str(), - pred_node_and_out.first->GetName().c_str(), dst_node->GetName().c_str()); + node->GetName().c_str(), dst_node->GetName().c_str(), pred_node_and_out.first->GetName().c_str(), + dst_node->GetName().c_str()); ret = RemoveSwitchNodeLogically(i, dst_node); if (ret != SUCCESS) { return ret; @@ -139,8 +132,8 @@ Status SwitchLogicRemovePass::RemoveSwitchNodeLogically(int parent_index, NodePt continue; } - GELOGI("Remove inactivate branch %s(%d) from switch %s", - GetOutputNameFromIndex(i), i, switch_node->GetName().c_str()); + GELOGI("Remove inactive branch %s(%d) from switch %s", GetOutputNameFromIndex(i), i, + switch_node->GetName().c_str()); std::vector deleted_nodes; std::vector end_nodes; auto ret = PassUtils::RemoveInactiveBranchToMerge(out_anchor, deleted_nodes, end_nodes); @@ -150,20 +143,18 @@ Status SwitchLogicRemovePass::RemoveSwitchNodeLogically(int parent_index, NodePt for (auto &node : deleted_nodes) { GE_CHECK_NOTNULL(node); - GELOGD("Remove node %s from inactivate branch from switch %s", - node->GetName().c_str(), switch_node->GetName().c_str()); + GELOGD("Remove node %s from inactive branch from switch %s", node->GetName().c_str(), + switch_node->GetName().c_str()); AddNodeDeleted(node.get()); } for (auto &node : end_nodes) { GE_CHECK_NOTNULL(node); - GELOGD("Add end node %s to re-pass list, for inactivate branch from switch %s", - node->GetName().c_str(), switch_node->GetName().c_str()); + GELOGD("Add end node %s to re-pass list, for inactive branch from switch %s", node->GetName().c_str(), + switch_node->GetName().c_str()); AddRePassNode(node); } } - GELOGI("Remove switch node cascaded %s, replace out index %d", - switch_node->GetName().c_str(), parent_index); +
GELOGI("Remove switch node cascaded %s, replace out index %d", switch_node->GetName().c_str(), parent_index); return IsolateAndDeleteNode(switch_node, isolate_map); } } // namespace ge - diff --git a/src/ge/graph/passes/switch_logic_remove_pass.h b/src/ge/graph/passes/switch_logic_remove_pass.h index 80f4eae4..b711cc73 100644 --- a/src/ge/graph/passes/switch_logic_remove_pass.h +++ b/src/ge/graph/passes/switch_logic_remove_pass.h @@ -16,13 +16,13 @@ #ifndef GE_GRAPH_PASSES_SWITCH_LOGIC_REMOVE_PASS_H_ #define GE_GRAPH_PASSES_SWITCH_LOGIC_REMOVE_PASS_H_ - #include "graph/passes/base_pass.h" namespace ge { class SwitchLogicRemovePass : public BaseNodePass { public: Status Run(NodePtr &node) override; + private: Status RemoveSwitchNodeLogically(int parent_index, NodePtr &switch_node); }; diff --git a/src/ge/graph/passes/switch_op_pass.cc b/src/ge/graph/passes/switch_op_pass.cc index 6aa61352..5ed1cb1c 100644 --- a/src/ge/graph/passes/switch_op_pass.cc +++ b/src/ge/graph/passes/switch_op_pass.cc @@ -15,7 +15,6 @@ */ #include "graph/passes/switch_op_pass.h" - #include #include #include @@ -23,7 +22,6 @@ #include #include #include - #include "common/ge/ge_util.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" diff --git a/src/ge/graph/passes/switch_op_pass.h b/src/ge/graph/passes/switch_op_pass.h index 2bb1adf0..14cdd22c 100644 --- a/src/ge/graph/passes/switch_op_pass.h +++ b/src/ge/graph/passes/switch_op_pass.h @@ -23,7 +23,6 @@ #include #include #include - #include "inc/graph_pass.h" namespace ge { @@ -124,8 +123,8 @@ class SwitchOpPass : public GraphPass { Status UpdateCondBranch(NodePtr &node); - Status UpdateAttachFlag(const NodePtr &node, std::string &stream_label, - bool &merge_flag, bool &exit_flag, bool &net_output_flag); + Status UpdateAttachFlag(const NodePtr &node, std::string &stream_label, bool &merge_flag, bool &exit_flag, + bool &net_output_flag); Status UpdateLoopBranch(const std::stack &enter_nodes, const std::string &stream_label); diff --git a/src/ge/graph/passes/switch_pass.cc b/src/ge/graph/passes/switch_pass.cc index c8565a74..8230d294 100644 --- a/src/ge/graph/passes/switch_pass.cc +++ b/src/ge/graph/passes/switch_pass.cc @@ -18,7 +18,6 @@ #include #include - #include "framework/common/debug/ge_log.h" #include "common/ge_inner_error_codes.h" #include "common/types.h" @@ -120,12 +119,12 @@ Status SwitchPass::DeleteSwitchNode(NodePtr &node, NodePtr &pred_node, const Out Status SwitchPass::Run(NodePtr &node) { GELOGD("SwitchPass running"); if (node == nullptr) { - GELOGE(PARAM_INVALID, "param [node] must not be null."); + GELOGE(PARAM_INVALID, "Param [node] must not be null."); return PARAM_INVALID; } std::string op_type; - GE_CHK_STATUS_RET(GetOriginalType(node, op_type), "get original type failed"); + GE_CHK_STATUS_RET(GetOriginalType(node, op_type), "Get original type failed"); if ((op_type != SWITCH) && (op_type != REFSWITCH)) { return SUCCESS; } diff --git a/src/ge/graph/passes/transop_breadth_fusion_pass.cc b/src/ge/graph/passes/transop_breadth_fusion_pass.cc index 444ae979..bcf7e72f 100644 --- a/src/ge/graph/passes/transop_breadth_fusion_pass.cc +++ b/src/ge/graph/passes/transop_breadth_fusion_pass.cc @@ -26,6 +26,7 @@ namespace ge { Status TransOpBreadthFusionPass::Run(ge::ComputeGraphPtr graph) { + GE_TIMESTAMP_START(TransOpBreadthFusionPass); if (graph == nullptr) { return SUCCESS; } @@ -36,9 +37,9 @@ Status TransOpBreadthFusionPass::Run(ge::ComputeGraphPtr graph) { for (auto const &id_to_trans_nodes : ids_to_trans_nodes) { if 
(id_to_trans_nodes.second.size() > 1) { GELOGI( - "Begin to breath fusion output trans-op-nodes for %s, " - "trans id %s, trans-op count %zu", - node->GetName().c_str(), id_to_trans_nodes.first.c_str(), id_to_trans_nodes.second.size()); + "Begin breadth fusion of output trans-op-nodes for %s, " + "trans id %s, trans-op count %zu", + node->GetName().c_str(), id_to_trans_nodes.first.c_str(), id_to_trans_nodes.second.size()); graphStatus status = Fusion(id_to_trans_nodes.second, graph); if (status != GRAPH_SUCCESS) { return FAILED; @@ -46,6 +47,7 @@ Status TransOpBreadthFusionPass::Run(ge::ComputeGraphPtr graph) { } } } + GE_TIMESTAMP_END(TransOpBreadthFusionPass, "GraphManager::TransOpBreadthFusionPass"); return SUCCESS; } @@ -76,35 +78,40 @@ std::string TransOpBreadthFusionPass::GetNodeId(const int anchor_index, const No GELOGD("Get stream label %s for node %s, add it to fusion id", stream_label.c_str(), node->GetName().c_str()); id << '-' << stream_label; } + // [Cascade pointer] + const auto &input_desc = node->GetOpDesc()->MutableInputDesc(0); + const auto &output_desc = node->GetOpDesc()->MutableOutputDesc(0); + GE_CHECK_NOTNULL_EXEC(input_desc, return ""); + GE_CHECK_NOTNULL_EXEC(output_desc, return ""); if (trans_data_type) { id << '-'; - id << static_cast(node->GetOpDesc()->GetInputDesc(0).GetDataType()); + id << static_cast(input_desc->GetDataType()); id << '-'; - id << static_cast(node->GetOpDesc()->GetOutputDesc(0).GetDataType()); + id << static_cast(output_desc->GetDataType()); } if (trans_format) { id << '-'; - id << static_cast(node->GetOpDesc()->GetInputDesc(0).GetFormat()); + id << static_cast(input_desc->GetFormat()); id << '-'; - id << static_cast(node->GetOpDesc()->GetOutputDesc(0).GetFormat()); + id << static_cast(output_desc->GetFormat()); } if (trans_shape) { id << '-'; - id << JoinDims(",", node->GetOpDesc()->GetInputDesc(0).GetShape().GetDims()); + id << JoinDims(",", input_desc->GetShape().GetDims()); id << '-'; - id << JoinDims(",", node->GetOpDesc()->GetOutputDesc(0).GetShape().GetDims()); + id << JoinDims(",", output_desc->GetShape().GetDims()); } return id.str(); } -/// -/// Get all transform operators in the output of node. -/// @param node -/// @return std::map -/// key - transform operator identifer -/// value - transform operator set -/// +/** + * Get all transform operators in the output of node. + * @param node + * @return std::map + * key - transform operator identifier + * value - transform operator set + */ std::map> TransOpBreadthFusionPass::GetOutputTransOpNodes(const NodePtr &node) { auto result = std::map>(); if (node == nullptr) { @@ -134,13 +141,13 @@ std::map> TransOpBreadthFusionPass::GetOutputT return result; } -/// -/// Reserving Transform operators which with smaller topo index, -/// other transform operators's output edges merge to the reserved transform operator. -/// Removed transform operators have no output edges. -/// @param trans_nodes -/// @param graph -/// +/** + * Reserve the transform operator with the smallest topo index; + * the other transform operators' output edges are merged into the reserved operator. + * Removed transform operators have no output edges.
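GetNodeId above concatenates the stream label, data types, formats and shapes into one string so that trans-ops with identical signatures can be grouped and fused breadth-wise. A compact sketch of that id construction; the struct name and fixed field set are simplifications (the real pass reads these from the op's tensor descriptors and only appends the fields selected by its trans_* flags):

```cpp
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

struct TransOpSignature {
  std::string stream_label;
  int in_dtype = 0, out_dtype = 0;   // enum values in the real code
  int in_format = 0, out_format = 0;
  std::vector<int64_t> in_dims, out_dims;
};

static std::string JoinDims(const std::string &sep, const std::vector<int64_t> &dims) {
  std::stringstream ss;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (i > 0) ss << sep;
    ss << dims[i];
  }
  return ss.str();
}

// Two trans-ops with the same id perform the same transformation on the same
// anchor and can be merged so the transformation runs only once.
std::string GetNodeId(int anchor_index, const TransOpSignature &sig) {
  std::stringstream id;
  id << anchor_index;
  if (!sig.stream_label.empty()) id << '-' << sig.stream_label;
  id << '-' << sig.in_dtype << '-' << sig.out_dtype;
  id << '-' << sig.in_format << '-' << sig.out_format;
  id << '-' << JoinDims(",", sig.in_dims) << '-' << JoinDims(",", sig.out_dims);
  return id.str();
}
```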
+ * @param trans_nodes + * @param graph + */ graphStatus TransOpBreadthFusionPass::Fusion(const std::vector &trans_nodes, ComputeGraphPtr &graph) { if (trans_nodes.empty()) { return GRAPH_FAILED; diff --git a/src/ge/graph/passes/transop_depth_fusion_pass.cc b/src/ge/graph/passes/transop_depth_fusion_pass.cc index ad98369f..da16ddbd 100644 --- a/src/ge/graph/passes/transop_depth_fusion_pass.cc +++ b/src/ge/graph/passes/transop_depth_fusion_pass.cc @@ -24,16 +24,18 @@ #include "graph/ge_tensor.h" #include "graph/op_desc.h" #include "graph/utils/graph_utils.h" +#include "graph/common/transop_util.h" namespace ge { graphStatus TransOpDepthFusionPass::Run(ComputeGraphPtr graph) { + GE_TIMESTAMP_START(TransOpDepthFusionPass); GELOGI("[TransOpDepthFusionPass]: optimize in depth begin..."); if (graph == nullptr) { return GRAPH_SUCCESS; } for (const auto &node : graph->GetAllNodes()) { GE_CHECK_NOTNULL(node); - if (IsTransOp(node)) { + if (TransOpUtil::IsTransOp(node)) { continue; } GELOGD("Current normal node is: %s, type: %s, begin in-depth recursive", node->GetName().c_str(), @@ -49,6 +51,7 @@ graphStatus TransOpDepthFusionPass::Run(ComputeGraphPtr graph) { } } GELOGI("[TransOpDepthFusionPass]: Optimize in depth success..."); + GE_TIMESTAMP_END(TransOpDepthFusionPass, "GraphManager::TransOpDepthFusionPass"); return GRAPH_SUCCESS; } @@ -84,12 +87,12 @@ graphStatus TransOpDepthFusionPass::RecursiveInDepth(const InDataAnchorPtr &dst_ return GRAPH_FAILED; } auto node = dst_in_anchor->GetOwnerNode(); - if (!IsTransOp(node)) { + if (!TransOpUtil::IsTransOp(node) || dst_in_anchor->GetIdx() != TransOpUtil::GetTransOpDataIndex(node)) { GELOGD("Now the end of this branch, node: %s, type: %s, recursive depth: %u", node->GetName().c_str(), node->GetType().c_str(), temp_depth); temp_depth--; return GRAPH_SUCCESS; - } else if (node->GetType() == RESHAPE || node->GetType() == REFORMAT) { + } else if (CheckNodeCanBeDeleted(node)) { GELOGD("node: %s, type: %s does not change memory, just delete", node->GetName().c_str(), node->GetType().c_str()); auto out_anchor = node->GetOutDataAnchor(0); @@ -160,12 +163,9 @@ graphStatus TransOpDepthFusionPass::RecursiveInDepth(const InDataAnchorPtr &dst_ return GRAPH_SUCCESS; } -bool TransOpDepthFusionPass::IsTransOp(const NodePtr &node) { - if (node == nullptr) { - return false; - } - return node->GetType() == CAST || node->GetType() == RESHAPE || node->GetType() == REFORMAT || - node->GetType() == TRANSPOSE || node->GetType() == TRANSPOSED || node->GetType() == TRANSDATA; +bool TransOpDepthFusionPass::CheckNodeCanBeDeleted(const NodePtr &node) { + return node->GetType() == RESHAPE || node->GetType() == REFORMAT || node->GetType() == SQUEEZE || + node->GetType() == EXPANDDIMS; } bool TransOpDepthFusionPass::DescAreSymmetry(const NodePtr &src_node, const NodePtr &dst_node) { @@ -173,14 +173,16 @@ bool TransOpDepthFusionPass::DescAreSymmetry(const NodePtr &src_node, const Node dst_node->GetOpDesc() == nullptr) { return false; } - auto src_input_desc = src_node->GetOpDesc()->GetInputDesc(0); - auto dst_output_desc = dst_node->GetOpDesc()->GetOutputDesc(0); - auto src_input_dtype = src_input_desc.GetDataType(); - auto src_input_format = src_input_desc.GetFormat(); - auto src_input_shape = src_input_desc.GetShape().GetDims(); - auto dst_output_dtype = dst_output_desc.GetDataType(); - auto dst_output_format = dst_output_desc.GetFormat(); - auto dst_output_shape = dst_output_desc.GetShape().GetDims(); + const auto &src_input_desc = src_node->GetOpDesc()->MutableInputDesc(0); + 
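Several passes in this patch gain GE_TIMESTAMP_START/GE_TIMESTAMP_END pairs around their Run bodies. The macro definitions are not part of this diff; an equivalent effect can be sketched with a scoped std::chrono timer (an assumption about the macros' intent, not their implementation):

```cpp
#include <chrono>
#include <iostream>
#include <string>

// Scoped timer: logs the elapsed time for a named stage on destruction,
// approximating a GE_TIMESTAMP_START/GE_TIMESTAMP_END pair.
class ScopedTimestamp {
 public:
  explicit ScopedTimestamp(std::string stage)
      : stage_(std::move(stage)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimestamp() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_).count();
    std::cout << stage_ << " took " << us << " us\n";
  }

 private:
  std::string stage_;
  std::chrono::steady_clock::time_point start_;
};

inline void RunPassBody() {}  // stand-in for the pass logic being timed

void TimedPass() {
  ScopedTimestamp t("GraphManager::TransOpDepthFusionPass");
  RunPassBody();  // everything inside this scope is measured
}
```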
const auto &dst_output_desc = dst_node->GetOpDesc()->MutableOutputDesc(0); + GE_CHECK_NOTNULL_EXEC(src_input_desc, return false); + GE_CHECK_NOTNULL_EXEC(dst_output_desc, return false); + const auto &src_input_dtype = src_input_desc->GetDataType(); + const auto &src_input_format = src_input_desc->GetFormat(); + const auto &src_input_shape = src_input_desc->GetShape().GetDims(); + const auto &dst_output_dtype = dst_output_desc->GetDataType(); + const auto &dst_output_format = dst_output_desc->GetFormat(); + const auto &dst_output_shape = dst_output_desc->GetShape().GetDims(); if (src_node->GetType() == CAST && dst_node->GetType() == CAST) { return src_input_dtype == dst_output_dtype && src_input_format == dst_output_format; diff --git a/src/ge/graph/passes/transop_depth_fusion_pass.h b/src/ge/graph/passes/transop_depth_fusion_pass.h index 7188f6c0..cc449893 100644 --- a/src/ge/graph/passes/transop_depth_fusion_pass.h +++ b/src/ge/graph/passes/transop_depth_fusion_pass.h @@ -36,11 +36,11 @@ class TransOpDepthFusionPass : public GraphPass { private: /// - /// judge whether an operator is a transform op or not + /// Judge whether the node can be deleted in depth fusion /// @param node /// @return True or False /// - static bool IsTransOp(const NodePtr &node); + static bool CheckNodeCanBeDeleted(const NodePtr &node); /// /// two transform nodes can be offset only when the front node's input is diff --git a/src/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc b/src/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc index 083c30ea..4b08e956 100644 --- a/src/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc +++ b/src/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc @@ -15,7 +15,6 @@ */ #include "graph/passes/transop_nearby_allreduce_fusion_pass.h" - #include "framework/common/debug/ge_log.h" #include "common/debug/log.h" #include "common/types.h" @@ -56,33 +55,35 @@ bool TransOpNearbyAllreduceFusionPass::IsSymmetricTransOps(const NodePtr &node1, return false; } - auto node1_input_desc = node1->GetOpDesc()->GetInputDesc(0); - auto node1_output_desc = node1->GetOpDesc()->GetOutputDesc(0); + const auto &node1_input_desc = node1->GetOpDesc()->MutableInputDesc(0); + const auto &node1_output_desc = node1->GetOpDesc()->MutableOutputDesc(0); + GE_CHECK_NOTNULL_EXEC(node1_input_desc, return false); + GE_CHECK_NOTNULL_EXEC(node1_output_desc, return false); - auto node2_input_desc = node2->GetOpDesc()->GetInputDesc(0); - auto node2_output_desc = node2->GetOpDesc()->GetOutputDesc(0); + const auto &node2_input_desc = node2->GetOpDesc()->MutableInputDesc(0); + const auto &node2_output_desc = node2->GetOpDesc()->MutableOutputDesc(0); + GE_CHECK_NOTNULL_EXEC(node2_input_desc, return false); + GE_CHECK_NOTNULL_EXEC(node2_output_desc, return false); // two symmetric trans ops should have symmetric input/output datatype - GELOGD("format: nod1_input=%d, nod1_output=%d, nod2_input=%d, nod2_output=%d", - node1_input_desc.GetFormat(), node1_output_desc.GetFormat(), node2_input_desc.GetFormat(), - node2_output_desc.GetFormat()); - if (node1_input_desc.GetFormat() != node2_output_desc.GetFormat() || - node1_output_desc.GetFormat() != node2_input_desc.GetFormat()) { + GELOGD("format: node1_input=%d, node1_output=%d, node2_input=%d, node2_output=%d", node1_input_desc->GetFormat(), + node1_output_desc->GetFormat(), node2_input_desc->GetFormat(), node2_output_desc->GetFormat()); + if (node1_input_desc->GetFormat() != node2_output_desc->GetFormat() || + node1_output_desc->GetFormat() != 
node2_input_desc->GetFormat()) { return false; } // two symmetric trans ops should have symmetric input/output format - GELOGD("datatype: nod1_input=%d, nod1_output=%d, nod2_input=%d, nod2_output=%d", - node1_input_desc.GetDataType(), node1_output_desc.GetDataType(), node2_input_desc.GetDataType(), - node2_output_desc.GetDataType()); - if (node1_input_desc.GetDataType() != node2_output_desc.GetDataType() || - node1_output_desc.GetDataType() != node2_input_desc.GetDataType()) { + GELOGD("datatype: node1_input=%d, node1_output=%d, node2_input=%d, node2_output=%d", node1_input_desc->GetDataType(), + node1_output_desc->GetDataType(), node2_input_desc->GetDataType(), node2_output_desc->GetDataType()); + if (node1_input_desc->GetDataType() != node2_output_desc->GetDataType() || + node1_output_desc->GetDataType() != node2_input_desc->GetDataType()) { return false; } // two symmetric trans ops should have symmetric input/output shape - if (node1_input_desc.GetShape().GetDims() != node2_output_desc.GetShape().GetDims() || - node1_output_desc.GetShape().GetDims() != node2_input_desc.GetShape().GetDims()) { + if (node1_input_desc->GetShape().GetDims() != node2_output_desc->GetShape().GetDims() || + node1_output_desc->GetShape().GetDims() != node2_input_desc->GetShape().GetDims()) { return false; } return true; @@ -133,8 +134,8 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt GELOGI("in_node=%s, out_node=%s", in_node->GetName().c_str(), out_node->GetName().c_str()); if (!IsSymmetricTransOps(in_node, out_node)) { - GELOGD("ignore asymmetric transop %s and %s for node %s", - in_node->GetName().c_str(), out_node->GetName().c_str(), node->GetName().c_str()); + GELOGD("ignore asymmetric transop %s and %s for node %s", in_node->GetName().c_str(), out_node->GetName().c_str(), + node->GetName().c_str()); continue; } @@ -164,8 +165,8 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt if (node->GetOpDesc()->UpdateOutputDesc(static_cast(i), output_desc) != GRAPH_SUCCESS) { GELOGE(FAILED, "UpdateOutputDesc"); } - GELOGI("successfully remove paired transop (%s and %s) for node %s", - in_node->GetName().c_str(), out_node->GetName().c_str(), node->GetName().c_str()); + GELOGI("successfully remove paired transop (%s and %s) for node %s", in_node->GetName().c_str(), + out_node->GetName().c_str(), node->GetName().c_str()); } GELOGI("successfully remove %zu pair of transops in total for node %s", removed_node_count, node->GetName().c_str()); return SUCCESS; diff --git a/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc b/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc index 66a7278c..b1df8e09 100644 --- a/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc +++ b/src/ge/graph/passes/transop_without_reshape_fusion_pass.cc @@ -15,12 +15,10 @@ */ #include "graph/passes/transop_without_reshape_fusion_pass.h" - #include #include #include #include - #include "common/ge/ge_util.h" #include "common/ge_inner_error_codes.h" #include "common/types.h" @@ -198,8 +196,8 @@ void TransOpWithoutReshapeFusionPass::GetInControlPeerOutControlAnchors( if (peer_node == nullptr) { continue; } - auto findIter = std::find(sub_graph_nodes_[index].begin(), sub_graph_nodes_[index].end(), peer_node); - if (findIter == sub_graph_nodes_[index].end()) { + auto iter = std::find(sub_graph_nodes_[index].begin(), sub_graph_nodes_[index].end(), peer_node); + if (iter == sub_graph_nodes_[index].end()) {
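IsSymmetricTransOps above condenses to one rule: the two trans-ops flanking the allreduce cancel exactly when each one's input descriptor equals the other's output descriptor in format, data type and shape. The pointer accessors and null guards aside, the core check reduces to this sketch (DescLite and its fields are simplified stand-ins for GeTensorDesc):

```cpp
#include <cstdint>
#include <vector>

struct DescLite {
  int dtype = 0;
  int format = 0;
  std::vector<int64_t> dims;
  bool operator==(const DescLite &o) const {
    return dtype == o.dtype && format == o.format && dims == o.dims;
  }
};

// Two trans-ops around an allreduce cancel out when each one's input
// descriptor matches the other's output descriptor in both directions.
bool IsSymmetricTransOps(const DescLite &n1_in, const DescLite &n1_out,
                         const DescLite &n2_in, const DescLite &n2_out) {
  return n1_in == n2_out && n1_out == n2_in;
}
```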
in_control_peer_out_control_anchors[index].push_back(peer_out_anchor); } else { sub_graph_has_control_edge_[index] = true; @@ -1064,7 +1062,7 @@ graphStatus TransOpWithoutReshapeFusionPass::GetSubGraphsBetweenNormalNode( continue; } - nodes_list.push_back(make_pair(out_anchor, peer_in_anchor)); + nodes_list.emplace_back(out_anchor, peer_in_anchor); auto peer_in_node = peer_in_anchor->GetOwnerNode(); GE_CHECK_NOTNULL(peer_in_node); if (!IsTransOp(peer_in_node)) { diff --git a/src/ge/graph/passes/transop_without_reshape_fusion_pass.h b/src/ge/graph/passes/transop_without_reshape_fusion_pass.h index 31cd23de..4d037957 100644 --- a/src/ge/graph/passes/transop_without_reshape_fusion_pass.h +++ b/src/ge/graph/passes/transop_without_reshape_fusion_pass.h @@ -17,9 +17,8 @@ #ifndef GE_GRAPH_PASSES_TRANSOP_WITHOUT_RESHAPE_FUSION_PASS_H_ #define GE_GRAPH_PASSES_TRANSOP_WITHOUT_RESHAPE_FUSION_PASS_H_ -#include #include - +#include #include "inc/graph_pass.h" namespace ge { diff --git a/src/ge/graph/passes/unused_const_pass.cc b/src/ge/graph/passes/unused_const_pass.cc index dc8c7c07..386633b5 100644 --- a/src/ge/graph/passes/unused_const_pass.cc +++ b/src/ge/graph/passes/unused_const_pass.cc @@ -15,9 +15,7 @@ */ #include "graph/passes/unused_const_pass.h" - #include - #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" diff --git a/src/ge/graph/passes/unused_op_remove_pass.cc b/src/ge/graph/passes/unused_op_remove_pass.cc index 0b083d4e..093d931a 100644 --- a/src/ge/graph/passes/unused_op_remove_pass.cc +++ b/src/ge/graph/passes/unused_op_remove_pass.cc @@ -15,12 +15,10 @@ */ #include "graph/passes/unused_op_remove_pass.h" - #include #include #include #include - #include "common/debug/log.h" #include "common/op/ge_op_utils.h" #include "common/types.h" @@ -60,6 +58,7 @@ Status UnusedOpRemovePass::Run(ComputeGraphPtr graph) { GE_CHECK_NOTNULL(dst_node->GetOpDesc()); int dst_index = in_anchor->GetIdx(); std::vector list_bool; + GE_CHECK_NOTNULL(dst_node->GetOpDesc()); list_bool = dst_node->GetOpDesc()->GetIsInputConst(); GE_IF_BOOL_EXEC(list_bool.size() == 0, continue); list_bool.erase(list_bool.begin() + dst_index); diff --git a/src/ge/graph/passes/unused_op_remove_pass.h b/src/ge/graph/passes/unused_op_remove_pass.h index 525dfa7e..bbc43af5 100644 --- a/src/ge/graph/passes/unused_op_remove_pass.h +++ b/src/ge/graph/passes/unused_op_remove_pass.h @@ -19,7 +19,6 @@ #include #include - #include "framework/common/ge_types.h" #include "inc/graph_pass.h" diff --git a/src/ge/graph/passes/update_net_output_pass.cc b/src/ge/graph/passes/update_net_output_pass.cc deleted file mode 100644 index bf887115..00000000 --- a/src/ge/graph/passes/update_net_output_pass.cc +++ /dev/null @@ -1,169 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "graph/passes/update_net_output_pass.h" -#include -#include -#include -#include "omg/omg_inner_types.h" -#include "common/util.h" -#include "common/formats/formats.h" -#include "common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.h" -#include "common/formats/format_transfers/format_transfer_nchw_nc1hwc0.h" - -namespace ge { -static std::map kOutputTypeStrToDataType = { - {"FP32", ge::DT_FLOAT}, {"FP16", ge::DT_FLOAT16}, {"INT8", ge::DT_INT8}, {"INT16", ge::DT_INT16}, - {"UINT16", ge::DT_UINT16}, {"UINT8", ge::DT_UINT8}, {"INT32", ge::DT_INT32}, {"INT64", ge::DT_INT64}, - {"UINT32", ge::DT_UINT32}, {"UINT64", ge::DT_UINT64}, {"DOUBLE", ge::DT_DOUBLE}, -}; - -static void SetNetoutputDataType(OpDescPtr &op_desc, uint32_t index, ge::DataType output_data_type) { - // op_desc is judged not nullptr - auto net_output_in_desc = op_desc->MutableInputDesc(index); - if (net_output_in_desc != nullptr) { - net_output_in_desc->SetDataType(output_data_type); - net_output_in_desc->SetOriginDataType(output_data_type); - GELOGI("Update input desc, datatype:%s,", - TypeUtils::DataTypeToSerialString(op_desc->GetInputDesc(0).GetDataType()).c_str()); - } - auto net_output_out_desc = op_desc->MutableOutputDesc(index); - if (net_output_out_desc != nullptr) { - net_output_out_desc->SetDataType(output_data_type); - net_output_out_desc->SetOriginDataType(output_data_type); - GELOGI("Update out desc, datatype:%s", - TypeUtils::DataTypeToSerialString(op_desc->GetOutputDesc(0).GetDataType()).c_str()); - } -} - -static Status SetNetoutputFormat(OpDescPtr op_desc, uint32_t index, ge::Format format) { - // op_desc is judged not nullptr - auto net_output_in_desc = op_desc->MutableInputDesc(index); - GE_CHECK_NOTNULL(net_output_in_desc); - ge::Format old_format = net_output_in_desc->GetFormat(); - bool support = ((old_format == FORMAT_NC1HWC0) || (old_format == FORMAT_NCHW) || (old_format == FORMAT_NHWC)); - if (!support) { - GELOGE(INTERNAL_ERROR, "The node %s format [%s] is unsupported", op_desc->GetName().c_str(), - TypeUtils::FormatToSerialString(old_format).c_str()); - return FAILED; - } - if (old_format == FORMAT_NC1HWC0) { - GELOGI("No need to transfer format"); - return SUCCESS; - } - std::vector old_shape = net_output_in_desc->GetShape().GetDims(); - ge::DataType dt = net_output_in_desc->GetDataType(); - std::vector dst_shape_dims; - if (old_format == FORMAT_NCHW) { - formats::FormatTransferNchwNc1hwc0 transfer; - if (transfer.TransShape(old_format, old_shape, dt, format, dst_shape_dims) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "TransShape failed"); - return FAILED; - } - } - if (old_format == FORMAT_NHWC) { - formats::FormatTransferNhwcNc1hwc0 transfer; - if (transfer.TransShape(old_format, old_shape, dt, format, dst_shape_dims) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "TransShape failed"); - return FAILED; - } - } - net_output_in_desc->SetShape(ge::GeShape(dst_shape_dims)); - net_output_in_desc->SetOriginShape(ge::GeShape(dst_shape_dims)); - net_output_in_desc->SetFormat(format); - net_output_in_desc->SetOriginFormat(format); - GELOGI("Update input desc, format:%s,", - TypeUtils::FormatToSerialString(op_desc->GetInputDesc(0).GetFormat()).c_str()); - - auto net_output_out_desc = op_desc->MutableOutputDesc(index); - if (net_output_out_desc == nullptr) { - GELOGW("The opdesc is nullptr"); - return FAILED; - } - net_output_out_desc->SetShape(ge::GeShape(dst_shape_dims)); - net_output_out_desc->SetOriginShape(ge::GeShape(dst_shape_dims)); - net_output_out_desc->SetFormat(format); - 
net_output_out_desc->SetOriginFormat(format); - GELOGI("Update out desc, format:%s", TypeUtils::FormatToSerialString(op_desc->GetOutputDesc(0).GetFormat()).c_str()); - return SUCCESS; -} - -Status ReUpdateNetOutputPass::Run(ge::NodePtr &node) { - GELOGD("ReUpdateNetOutputPass running"); - if (node == nullptr) { - GELOGE(FAILED, "parameter is null."); - return FAILED; - } - auto op_desc = node->GetOpDesc(); - if (op_desc == nullptr) { - GELOGE(FAILED, "op_desc is null."); - return FAILED; - } - - std::string op_type = op_desc->GetType(); - if (op_type != NETOUTPUT) { - return SUCCESS; - } - GELOGD("NetOutput start ReUpdateNetOutputPass"); - bool is_set_output_type = false; - ge::DataType output_data_type = ge::DT_FLOAT; - std::string output_type = domi::GetContext().output_type; - if (kOutputTypeStrToDataType.find(output_type) != kOutputTypeStrToDataType.end()) { - output_data_type = kOutputTypeStrToDataType[output_type]; - is_set_output_type = true; - } else { - GELOGW("output_type [%s] set can not find", output_type.c_str()); - is_set_output_type = false; - } - - for (const auto &in_anchor : node->GetAllInDataAnchors()) { - auto index = static_cast(in_anchor->GetIdx()); - // Update datatype - if (is_set_output_type) { - SetNetoutputDataType(op_desc, index, output_data_type); - continue; - } - // output_node is not set,check if is_output_adjust_hw_layout is set - auto peer_out = in_anchor->GetPeerOutAnchor(); - GE_CHECK_NOTNULL(peer_out); - auto own_node = peer_out->GetOwnerNode(); - GE_CHECK_NOTNULL(own_node); - OpDescPtr src_op_desc = own_node->GetOpDesc(); - GE_CHECK_NOTNULL(src_op_desc); - bool set_fp16_nc1hwc0 = false; - if (AttrUtils::GetBool(src_op_desc, "output_set_fp16_nc1hwc0", set_fp16_nc1hwc0)) { - GELOGI("This output [%s] should be set FP16 and NC1HWC0", src_op_desc->GetName().c_str()); - if (set_fp16_nc1hwc0) { - SetNetoutputDataType(op_desc, index, ge::DT_FLOAT16); - if (SetNetoutputFormat(op_desc, index, FORMAT_NC1HWC0) != SUCCESS) { - GELOGE(PARAM_INVALID, "SetNetoutputFormat failed"); - return FAILED; - } - // set the outputdesc originformat NC1HWC0, as partition insert placehold node format is based on originformat - auto src_index = static_cast(in_anchor->GetPeerOutAnchor()->GetIdx()); - auto src_output_desc = src_op_desc->MutableOutputDesc(src_index); - if (src_output_desc == nullptr) { - GELOGE(PARAM_INVALID, "src_output_desc is m=nullptr"); - return FAILED; - } - src_output_desc->SetOriginFormat(FORMAT_NC1HWC0); - } - } - } - GELOGD("node[%s] ReUpdateNetOutputPass done", op_type.c_str()); - return SUCCESS; -} -} // namespace ge diff --git a/src/ge/graph/passes/var_is_initialized_op_pass.h b/src/ge/graph/passes/var_is_initialized_op_pass.h index 83fb421f..37b3f49b 100644 --- a/src/ge/graph/passes/var_is_initialized_op_pass.h +++ b/src/ge/graph/passes/var_is_initialized_op_pass.h @@ -16,12 +16,10 @@ #ifndef GE_GRAPH_PASSES_VAR_IS_INITIALIZED_OP_PASS_H_ #define GE_GRAPH_PASSES_VAR_IS_INITIALIZED_OP_PASS_H_ - #include #include #include #include - #include "graph/passes/base_pass.h" namespace ge { diff --git a/src/ge/graph/passes/variable_format_pass.cc b/src/ge/graph/passes/variable_format_pass.cc index 9b5e284e..28f6a4f7 100644 --- a/src/ge/graph/passes/variable_format_pass.cc +++ b/src/ge/graph/passes/variable_format_pass.cc @@ -15,11 +15,9 @@ */ #include "graph/passes/variable_format_pass.h" - #include #include #include - #include "framework/common/debug/ge_log.h" namespace ge { diff --git a/src/ge/graph/passes/variable_format_pass.h 
b/src/ge/graph/passes/variable_format_pass.h index 009ae14f..1a0abe2e 100644 --- a/src/ge/graph/passes/variable_format_pass.h +++ b/src/ge/graph/passes/variable_format_pass.h @@ -20,7 +20,6 @@ #include #include #include - #include "graph/types.h" #include "graph/utils/op_desc_utils.h" #include "inc/graph_pass.h" diff --git a/src/ge/graph/passes/variable_op_pass.cc b/src/ge/graph/passes/variable_op_pass.cc index 26bb453a..eb8b5206 100644 --- a/src/ge/graph/passes/variable_op_pass.cc +++ b/src/ge/graph/passes/variable_op_pass.cc @@ -15,19 +15,18 @@ */ #include "graph/passes/variable_op_pass.h" - #include #include -#include "common/formats/formats.h" -#include "common/formats/utils/formats_trans_utils.h" #include "framework/common/debug/ge_log.h" -#include "graph/ge_context.h" #include "graph/graph.h" #include "graph/manager/graph_var_manager.h" #include "graph/utils/graph_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" +#include "common/formats/formats.h" +#include "common/formats/utils/formats_trans_utils.h" +#include "graph/ge_context.h" namespace ge { namespace { @@ -113,6 +112,7 @@ bool IsTransSupport(const TransNodeInfo &trans_info) { } // namespace Status VariableOpPass::Run(ge::ComputeGraphPtr graph) { + GE_TIMESTAMP_START(VariableOpPass); if (graph == nullptr) { GELOGE(INTERNAL_ERROR, "Failed to run variable op pass, null graph"); return INTERNAL_ERROR; @@ -189,6 +189,7 @@ Status VariableOpPass::Run(ge::ComputeGraphPtr graph) { } } + GE_TIMESTAMP_END(VariableOpPass, "GraphManager::VariableOpPass"); return SUCCESS; } @@ -404,10 +405,14 @@ Status VariableOpPass::UpdateVarAndRefOutputFormatInfo(const GeTensorDesc &final if (var_ref_node_description->UpdateInputDesc(0, node_desc) != GRAPH_SUCCESS) { GELOGW("UpdateInputDesc fail."); } + const auto &input_desc = var_ref_node_description->MutableInputDesc(0); + const auto &output_desc = var_ref_node_description->MutableOutputDesc(0); + GE_CHECK_NOTNULL(input_desc); + GE_CHECK_NOTNULL(output_desc); GELOGD("var_ref_node ref is (%s, %s, %zu), var_ref_name is %s.", - TypeUtils::DataTypeToSerialString(var_ref_node_description->GetInputDesc(0).GetDataType()).c_str(), - TypeUtils::FormatToSerialString(var_ref_node_description->GetInputDesc(0).GetFormat()).c_str(), - var_ref_node_description->GetOutputDesc(0).GetShape().GetDims().size(), var_ref_node->GetName().c_str()); + TypeUtils::DataTypeToSerialString(input_desc->GetDataType()).c_str(), + TypeUtils::FormatToSerialString(input_desc->GetFormat()).c_str(), output_desc->GetShape().GetDims().size(), + var_ref_node->GetName().c_str()); } return SUCCESS; @@ -470,25 +475,29 @@ Status VariableOpPass::CheckTransNodeAreInverse(const NodePtr &node_a, const Nod const auto &node_b_op_desc = node_b->GetOpDesc(); GE_CHECK_NOTNULL(node_a_op_desc); GE_CHECK_NOTNULL(node_b_op_desc); - const auto &node_a_out_op_desc = node_a_op_desc->GetOutputDesc(0); - const auto &node_a_in_op_desc = node_a_op_desc->GetInputDesc(0); + const auto &node_a_out_op_desc = node_a_op_desc->MutableOutputDesc(0); + const auto &node_a_in_op_desc = node_a_op_desc->MutableInputDesc(0); + GE_CHECK_NOTNULL(node_a_out_op_desc); + GE_CHECK_NOTNULL(node_a_in_op_desc); - const auto &node_b_out_op_desc = node_b_op_desc->GetOutputDesc(0); - const auto &node_b_in_op_desc = node_b_op_desc->GetInputDesc(0); + const auto &node_b_out_op_desc = node_b_op_desc->MutableOutputDesc(0); + const auto &node_b_in_op_desc = node_b_op_desc->MutableInputDesc(0); + GE_CHECK_NOTNULL(node_b_out_op_desc); + 
GE_CHECK_NOTNULL(node_b_in_op_desc); is_same = IsOpDescSame(node_a_out_op_desc, node_b_in_op_desc) && IsOpDescSame(node_b_out_op_desc, node_a_in_op_desc); return SUCCESS; } -bool VariableOpPass::IsOpDescSame(const GeTensorDesc &op_desc_a, const GeTensorDesc &op_desc_b) { - const auto format_a = op_desc_a.GetFormat(); - const auto type_a = op_desc_a.GetDataType(); - const auto shape_a = op_desc_a.GetShape(); +bool VariableOpPass::IsOpDescSame(const GeTensorDescPtr &op_desc_a, const GeTensorDescPtr &op_desc_b) { + const auto &format_a = op_desc_a->GetFormat(); + const auto &type_a = op_desc_a->GetDataType(); + const auto &shape_a = op_desc_a->GetShape(); - const auto format_b = op_desc_b.GetFormat(); - const auto type_b = op_desc_b.GetDataType(); - const auto shape_b = op_desc_b.GetShape(); + const auto &format_b = op_desc_b->GetFormat(); + const auto &type_b = op_desc_b->GetDataType(); + const auto &shape_b = op_desc_b->GetShape(); const auto &dims_a = shape_a.GetDims(); const auto &dims_b = shape_b.GetDims(); diff --git a/src/ge/graph/passes/variable_op_pass.h b/src/ge/graph/passes/variable_op_pass.h index f97f8d8a..4e194a0c 100644 --- a/src/ge/graph/passes/variable_op_pass.h +++ b/src/ge/graph/passes/variable_op_pass.h @@ -16,10 +16,8 @@ #ifndef GE_GRAPH_PASSES_VARIABLE_OP_PASS_H_ #define GE_GRAPH_PASSES_VARIABLE_OP_PASS_H_ - #include #include - #include "graph/common/transop_util.h" #include "graph/graph.h" #include "graph/manager/graph_var_manager.h" @@ -54,7 +52,7 @@ class VariableOpPass : public GraphPass { Status CheckVarAndVarRefAreAlike(const NodePtr &var_node, const NodePtr &var_ref_node, bool &is_var_and_var_ref_alike); - bool IsOpDescSame(const GeTensorDesc &op_desc_a, const GeTensorDesc &op_desc_b); + bool IsOpDescSame(const GeTensorDescPtr &op_desc_a, const GeTensorDescPtr &op_desc_b); Status CheckTransNodeAreInverse(const NodePtr &node_a, const NodePtr &node_b, bool &is_trans_node_inverse); diff --git a/src/ge/graph/passes/variable_prepare_op_pass.cc b/src/ge/graph/passes/variable_prepare_op_pass.cc index ca9e1c0a..c4ed0405 100644 --- a/src/ge/graph/passes/variable_prepare_op_pass.cc +++ b/src/ge/graph/passes/variable_prepare_op_pass.cc @@ -15,13 +15,10 @@ */ #include "graph/passes/variable_prepare_op_pass.h" - #include #include #include #include - -#include "framework/common/debug/ge_log.h" #include "common/ge/ge_util.h" #include "external/graph/graph.h" #include "framework/common/debug/ge_log.h" @@ -32,13 +29,11 @@ namespace ge { Status VariablePrepareOpPass::Run(ComputeGraphPtr graph) { GE_CHECK_NOTNULL(graph); - for (auto &node : graph->GetDirectNode()) { - GELOGD("before VariablePrepareOpPass, graph has node: %s, and node name: %s", node->GetType().c_str(), - node->GetName().c_str()); - } - for (const auto &node : graph->GetDirectNode()) { - GenerateRefTypeAndInputOutputMap(node); + auto iter = ref_input_output_map_.find(node->GetType()); + if (iter == ref_input_output_map_.end()) { + GenerateRefTypeAndInputOutputMap(node); + } } if (ref_input_output_map_.empty()) { @@ -48,17 +43,23 @@ Status VariablePrepareOpPass::Run(ComputeGraphPtr graph) { for (auto &node : graph->GetDirectNode()) { GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr, continue); - GE_IF_BOOL_EXEC(node->GetOpDesc()->GetType() != VARIABLE, continue); - Status ret = DealVariableNode(node); - if (ret != SUCCESS) { - GELOGE(ret, "variable add back edge failed"); - return FAILED; + bool is_variable = node->GetOpDesc()->GetType() == VARIABLE; + bool is_deal = has_dealed_variable_.find(node->GetName()) == 
has_dealed_variable_.end(); + if (is_variable && is_deal) { + Status ret = DealVariableNode(node); + if (ret != SUCCESS) { + GELOGE(ret, "variable add back edge failed"); + return FAILED; + } } } - for (auto &node : graph->GetDirectNode()) { - GELOGD("after VariablePrepareOpPass, graph has node: %s, and node name: %s", node->GetType().c_str(), - node->GetName().c_str()); + for (auto iter = ref_input_output_map_.begin(); iter != ref_input_output_map_.end(); ++iter) { + GELOGI("ref type:[ %s ]", iter->first.c_str()); + auto index_map = iter->second; + for (auto index_iter = index_map.begin(); index_iter != index_map.end(); ++index_iter) { + GELOGI("{ %d:%d }", index_iter->first, index_iter->second); + } } return SUCCESS; @@ -203,7 +204,8 @@ ge::NodePtr VariablePrepareOpPass::CreatVariableRef(ge::NodePtr &final_writable_ return nullptr; } - OpDescPtr var_ref_op_desc = MakeShared(var_node->GetName() + var_ref_name.str(), var_op_desc->GetType()); + OpDescPtr var_ref_op_desc = + MakeShared(var_node->GetName() + var_ref_name.str().c_str(), var_op_desc->GetType()); if (var_ref_op_desc == nullptr) { GELOGE(FAILED, "var_ref opdesc is nullptr"); return nullptr; @@ -217,6 +219,7 @@ ge::NodePtr VariablePrepareOpPass::CreatVariableRef(ge::NodePtr &final_writable_ return nullptr); NodePtr var_ref_node = var_node->GetOwnerComputeGraph()->AddNode(var_ref_op_desc); GE_IF_BOOL_EXEC(var_ref_node == nullptr, GELOGW("var_ref_node is null"); return nullptr); + has_dealed_variable_.insert(var_node->GetName()); bool is_set_str = ge::AttrUtils::SetStr(var_ref_op_desc, REF_VAR_SRC_VAR_NAME, var_op_desc->GetName()); if (is_set_str) { @@ -250,34 +253,30 @@ int VariablePrepareOpPass::GetWritableNodeOutIndex(const NodePtr &node, int inpu } void VariablePrepareOpPass::GenerateRefTypeAndInputOutputMap(const NodePtr &node) { - auto out_op_desc = node->GetOpDesc(); - map input_name_index; - for (const auto &input_name : out_op_desc->GetAllInputNames()) { - int index = out_op_desc->GetInputIndexByName(input_name); - input_name_index.emplace(input_name, index); + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + GELOGW("op_desc is null, please check node:[%s]", node->GetName().c_str()); + return; } + for (const auto &out_anchor : node->GetAllOutDataAnchors()) { + int output_index = out_anchor->GetIdx(); + string output_name = op_desc->GetOutputNameByIndex(output_index); + GELOGD("output name:[%s]", output_name.c_str()); - for (auto &out_data_anchor : node->GetAllOutDataAnchors()) { - string out_data_anchor_name = out_op_desc->GetOutputNameByIndex(out_data_anchor->GetIdx()); - auto iter = input_name_index.find(out_data_anchor_name); - if (iter != input_name_index.end()) { - GELOGD("From input_name_index_map find corresponding output name and out index : [ %s : %d]", - out_data_anchor_name.c_str(), out_data_anchor->GetIdx()); - auto ref_type_iter = ref_input_output_map_.find(node->GetType()); - if (ref_type_iter != ref_input_output_map_.end()) { - GELOGD("From ref_input_output_map_ find already existed ref_type_iter.
Type : [%s]", - ref_type_iter->first.c_str()); - auto input_output_iter = ref_type_iter->second.find(iter->second); - if (input_output_iter != ref_type_iter->second.end()) { - ref_type_iter->second.emplace(iter->second, out_data_anchor->GetIdx()); - GELOGI("Add RefInputOutputMap [ %s ] : {%d, %d}", node->GetType().c_str(), iter->second, - out_data_anchor->GetIdx()); - } - } else { - ref_input_output_map_.insert({node->GetType(), {{iter->second, out_data_anchor->GetIdx()}}}); - GELOGI("Create RefInputOutputMap { %s : {%d, %d}}", node->GetType().c_str(), iter->second, - out_data_anchor->GetIdx()); + int input_index = op_desc->GetInputIndexByName(output_name); + if (input_index == -1) { + continue; + } + auto ref_type_and_input_output_iter = ref_input_output_map_.find(node->GetType()); + if (ref_type_and_input_output_iter != ref_input_output_map_.end()) { + auto input_output_index_map = ref_type_and_input_output_iter->second; + if (input_output_index_map.find(input_index) == input_output_index_map.end()) { + input_output_index_map.emplace(input_index, output_index); + GELOGD("Add RefInputOutputMap %s:{ %d, %d }", node->GetType().c_str(), input_index, output_index); } + } else { + ref_input_output_map_.insert({node->GetType(), {{input_index, output_index}}}); + GELOGD("Create RefInputOutputMap { %s:{ %d, %d } }", node->GetType().c_str(), input_index, output_index); } } } diff --git a/src/ge/graph/passes/variable_prepare_op_pass.h b/src/ge/graph/passes/variable_prepare_op_pass.h index 738faa10..0fbd311c 100644 --- a/src/ge/graph/passes/variable_prepare_op_pass.h +++ b/src/ge/graph/passes/variable_prepare_op_pass.h @@ -39,6 +39,7 @@ class VariablePrepareOpPass : public GraphPass { void GenerateRefTypeAndInputOutputMap(const NodePtr &node); std::map> ref_input_output_map_; + std::unordered_set has_dealed_variable_{}; }; } // namespace ge diff --git a/src/ge/graph/passes/variable_ref_delete_op_pass.cc b/src/ge/graph/passes/variable_ref_delete_op_pass.cc index 5ff01a94..7bc767ee 100644 --- a/src/ge/graph/passes/variable_ref_delete_op_pass.cc +++ b/src/ge/graph/passes/variable_ref_delete_op_pass.cc @@ -15,13 +15,12 @@ */ #include "graph/passes/variable_ref_delete_op_pass.h" - #include - #include "framework/common/debug/ge_log.h" namespace ge { Status VariableRefDeleteOpPass::Run(ge::ComputeGraphPtr graph) { + GE_TIMESTAMP_START(VariableRefDeleteOpPass); GE_CHECK_NOTNULL(graph); for (auto &node : graph->GetDirectNode()) { @@ -48,6 +47,7 @@ Status VariableRefDeleteOpPass::Run(ge::ComputeGraphPtr graph) { GELOGD("after VariableRefDeleteOpPass, graph has node: %s, and node name: %s", node->GetType().c_str(), node->GetName().c_str()); } + GE_TIMESTAMP_END(VariableRefDeleteOpPass, "GraphManager::VariableRefDeleteOpPass"); return SUCCESS; } @@ -94,8 +94,8 @@ Status VariableRefDeleteOpPass::DealVariableRef(ge::ComputeGraphPtr &graph, ge:: GE_CHECK_NOTNULL(var_ref_src_var->GetOpDesc()); bool is_set_index = ge::AttrUtils::SetInt(var_ref_src_var->GetOpDesc(), REF_VAR_PRE_PEER_OUT_INDEX, index); if (is_set_str && is_set_index) { - GELOGD("[%s]: add attr [REF_VAR_SRC_VAR_NAME: %s ] ", peer_node->GetName().c_str(), ref_var_src_var_name.c_str()); - GELOGD("[%s]: add attr [ REF_VAR_PRE_PEER_OUT_INDEX: %d ]", var_ref_src_var->GetName().c_str(), index); + GELOGI("[%s]: add attr [REF_VAR_SRC_VAR_NAME: %s ] ", peer_node->GetName().c_str(), ref_var_src_var_name.c_str()); + GELOGI("[%s]: add attr [ REF_VAR_PRE_PEER_OUT_INDEX: %d ]", var_ref_src_var->GetName().c_str(), index); } return SUCCESS; diff --git 
a/src/ge/graph/passes/variable_ref_delete_op_pass.h b/src/ge/graph/passes/variable_ref_delete_op_pass.h index e481f9f8..43db2703 100644 --- a/src/ge/graph/passes/variable_ref_delete_op_pass.h +++ b/src/ge/graph/passes/variable_ref_delete_op_pass.h @@ -18,7 +18,6 @@ #define GE_GRAPH_PASSES_VARIABLE_REF_DELETE_OP_PASS_H_ #include - #include "framework/common/ge_inner_error_codes.h" #include "inc/graph_pass.h" @@ -28,8 +27,7 @@ class VariableRefDeleteOpPass : public GraphPass { Status Run(ge::ComputeGraphPtr graph); private: - Status DealVariableRef(ge::ComputeGraphPtr &graph, - ge::NodePtr &variable_ref, + Status DealVariableRef(ge::ComputeGraphPtr &graph, ge::NodePtr &variable_ref, const std::string &ref_var_src_var_name); }; } // namespace ge diff --git a/src/ge/graph/preprocess/graph_preprocess.cc b/src/ge/graph/preprocess/graph_preprocess.cc index c3066ea7..a33bc8cc 100644 --- a/src/ge/graph/preprocess/graph_preprocess.cc +++ b/src/ge/graph/preprocess/graph_preprocess.cc @@ -18,10 +18,13 @@ #include #include #include +#include +#include "common/formats/format_transfers/format_transfer_nchw_nc1hwc0.h" +#include "common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.h" +#include "common/helper/model_helper.h" #include "common/math/math_util.h" -#include "framework/common/debug/ge_log.h" #include "common/op/ge_op_utils.h" -#include "common/helper/model_helper.h" +#include "framework/common/debug/ge_log.h" #include "graph/common/transop_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" @@ -33,6 +36,8 @@ #include "graph/passes/assert_pass.h" #include "graph/passes/base_pass.h" #include "graph/passes/constant_folding_pass.h" +#include "graph/passes/constant_fuse_same_pass.h" +#include "graph/passes/control_trigger_pass.h" #include "graph/passes/dimension_adjust_pass.h" #include "graph/passes/dimension_compute_pass.h" #include "graph/passes/dropout_pass.h" @@ -48,7 +53,6 @@ #include "graph/passes/merge_pass.h" #include "graph/passes/net_output_pass.h" #include "graph/passes/next_iteration_pass.h" -#include "graph/passes/control_trigger_pass.h" #include "graph/passes/no_use_reshape_remove_pass.h" #include "graph/passes/placeholder_with_default_pass.h" #include "graph/passes/prevent_gradient_pass.h" @@ -65,22 +69,27 @@ #include "graph/passes/switch_pass.h" #include "graph/passes/unused_const_pass.h" #include "graph/passes/unused_op_remove_pass.h" -#include "graph/passes/update_net_output_pass.h" #include "graph/passes/var_is_initialized_op_pass.h" #include "graph/passes/variable_prepare_op_pass.h" -#include "graph/passes/constant_fuse_same_pass.h" +#include "graph/passes/common_subexpression_elimination_pass.h" +#include "graph/passes/replace_with_empty_const_pass.h" #include "graph/preprocess/insert_op/util_insert_aipp_op.h" #include "graph/types.h" +#include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" #include "inc/pass_manager.h" #include "init/gelib.h" -#include "common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.h" -#include "common/formats/format_transfers/format_transfer_nchw_nc1hwc0.h" -#include "graph/utils/tensor_utils.h" +#include "multi_batch_copy_graph.h" #include "runtime/dev.h" +using ge::CheckInt64Uint32MulOverflow; + namespace ge { namespace { +static std::map output_type_str_to_datatype = { + {"FP32", ge::DT_FLOAT}, {"FP16", ge::DT_FLOAT16}, {"INT8", ge::DT_INT8}, {"INT16", ge::DT_INT16}, + {"UINT16", ge::DT_UINT16}, {"UINT8", ge::DT_UINT8}, {"INT32", ge::DT_INT32}, {"INT64", ge::DT_INT64}, + {"UINT32", 
ge::DT_UINT32}, {"UINT64", ge::DT_UINT64}, {"DOUBLE", ge::DT_DOUBLE}}; OpDescPtr CreateTensorShape(const GeTensorDesc &data_tensor) { GeTensorPtr tensor = MakeShared(); @@ -109,6 +118,7 @@ OpDescPtr CreateTensorShape(const GeTensorDesc &data_tensor) { for (int64_t i = 0; i < dim_cnt; ++i) { dst_shape[i] = dst_ge_shape.GetDim(static_cast(i)); } + GE_IF_BOOL_EXEC( tensor->SetData(reinterpret_cast(dst_shape.get()), dim_cnt * sizeof(int64_t)) != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "tensor set data failed"); @@ -357,49 +367,6 @@ Status RecoverTransRoadForVarRef(const std::set &nodes, const VarTransR return SUCCESS; } -Status SetDataNodeByAipp(const ge::NodePtr &node_ptr) { - GELOGI("Start to set data node by aipp."); - auto aipp_op_desc = node_ptr->GetOpDesc(); - GE_CHECK_NOTNULL(aipp_op_desc); - auto aipp_input = aipp_op_desc->MutableInputDesc(0); - GE_CHECK_NOTNULL(aipp_input); - ge::DataType aipp_dt = aipp_input->GetDataType(); - GELOGI("Aipp [%s] input datatype is %s.", aipp_op_desc->GetName().c_str(), - TypeUtils::DataTypeToSerialString(aipp_dt).c_str()); - uint32_t size = 0; - graphStatus graph_ret = ge::TensorUtils::GetSize(*aipp_input, size); - if (graph_ret != GRAPH_SUCCESS) { - GELOGE(FAILED, "UpdateOutputDesc fail, graph_ret:%u", graph_ret); - return FAILED; - } - GELOGI("Get size [%u] from aipp [%s].", size, aipp_op_desc->GetName().c_str()); - if (size == 0) { - GELOGE(FAILED, "Can not get size from aipp [%s]", aipp_op_desc->GetName().c_str()); - return FAILED; - } - for (const auto &in_data_anchor : node_ptr->GetAllInDataAnchors()) { - const auto &peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); - GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); - const auto &src_node = peer_out_anchor->GetOwnerNode(); - const auto &src_op = src_node->GetOpDesc(); - GE_IF_BOOL_EXEC(src_op == nullptr, continue); - - const GeTensorDescPtr &input = src_op->MutableInputDesc(0); - GE_CHECK_NOTNULL(input); - input->SetDataType(aipp_dt); - input->SetOriginDataType(aipp_dt); - ge::TensorUtils::SetSize(*input, size); - - const GeTensorDescPtr &output = src_op->MutableOutputDesc(0); - GE_CHECK_NOTNULL(output); - output->SetDataType(aipp_dt); - output->SetOriginDataType(aipp_dt); - ge::TensorUtils::SetSize(*output, size); - GELOGI("Set data size [%u] by aipp.", size); - } - return SUCCESS; -} - using VarNamesToRefs = std::map>; VarNamesToRefs CollectVarNamesToRefs(const ComputeGraphPtr &graph) { @@ -440,10 +407,9 @@ NodePtr CreateCastOp(const ge::GeShape &shape, const ge::DataType input_data_typ static uint32_t transop_count = 0; std::string name = std::string("cast_node").append(std::to_string(transop_count++)); - GELOGI("Create cast op:%s, input datatype:%s, out datatype:%s", name.c_str(), + GELOGI("create cast op:%s, input datatype:%s, out datatype:%s", name.c_str(), TypeUtils::DataTypeToSerialString(input_data_type).c_str(), TypeUtils::DataTypeToSerialString(output_data_type).c_str()); - GeTensorDesc input(shape, format, input_data_type); input.SetOriginFormat(format); input.SetOriginShape(shape); @@ -456,7 +422,9 @@ NodePtr CreateCastOp(const ge::GeShape &shape, const ge::DataType input_data_typ output.SetOriginDataType(output_data_type); ge::TensorUtils::SetRealDimCnt(output, static_cast(shape.GetDims().size())); - return CreateTransNode(name, CAST, input, output, node); + auto cast_node = CreateTransNode(name, CAST, input, output, node); + GELOGD("Create cast node success."); + return cast_node; } Status ProcessInputFP16(NodePtr &node_ptr) { @@ -480,12 +448,12 @@ Status 
ProcessInputFP16(NodePtr &node_ptr) { int64_t shape_size = desc_shape * len; input->SetDataType(DT_FLOAT16); input->SetOriginDataType(DT_FLOAT16); - ge::TensorUtils::SetSize(*input, static_cast<uint32_t>(shape_size)); + ge::TensorUtils::SetSize(*input, shape_size); const GeTensorDescPtr &output = op_desc->MutableOutputDesc(0); GE_CHECK_NOTNULL(output); output->SetDataType(DT_FLOAT16); output->SetOriginDataType(DT_FLOAT16); - ge::TensorUtils::SetSize(*output, static_cast<uint32_t>(shape_size)); + ge::TensorUtils::SetSize(*output, shape_size); NodePtr cast_node = CreateCastOp(output->GetShape(), DT_FLOAT16, src_dtype, output->GetFormat(), node_ptr); GE_CHECK_NOTNULL(cast_node); @@ -506,7 +474,7 @@ NodePtr CreateTransdataNode(const ge::GeShape &in_shape, const ge::Format input_ // Does not involve multithreading. std::string name = std::string("transdata_node").append(std::to_string(transop_count++)); - GELOGI("Create trandata op:%s, input format:%s, out format:%s", name.c_str(), + GELOGI("create transdata op:%s, input format:%s, out format:%s", name.c_str(), TypeUtils::FormatToSerialString(input_format).c_str(), TypeUtils::FormatToSerialString(output_format).c_str()); GeTensorDesc input(in_shape, input_format, dt); @@ -522,6 +490,24 @@ NodePtr CreateTransdataNode(const ge::GeShape &in_shape, const ge::Format input_ return CreateTransNode(name, TRANSDATA, input, output, node); } +Status TransferShape2NC1HWC0(Format src_format, const std::vector<int64_t> &src_shape, DataType dt, Format dst_format, + std::vector<int64_t> &dst_shape) { + if (src_format == FORMAT_NCHW) { + formats::FormatTransferNchwNc1hwc0 transfer; + if (transfer.TransShape(src_format, src_shape, dt, dst_format, dst_shape) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "TransShape failed"); + return FAILED; + } + } else if (src_format == FORMAT_NHWC) { + formats::FormatTransferNhwcNc1hwc0 transfer; + if (transfer.TransShape(src_format, src_shape, dt, dst_format, dst_shape) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "TransShape failed"); + return FAILED; + } + } + return SUCCESS; +} + Status ModifyInputFormatAndShape(NodePtr &node_ptr) { GE_CHECK_NOTNULL(node_ptr); auto op_desc = node_ptr->GetOpDesc(); @@ -532,43 +518,35 @@ Status ModifyInputFormatAndShape(NodePtr &node_ptr) { std::vector<int64_t> old_shape = input->GetShape().GetDims(); ge::DataType dt = input->GetDataType(); std::vector<int64_t> dst_shape_dims; - if (old_format == FORMAT_NCHW) { - formats::FormatTransferNchwNc1hwc0 transfer; - if (transfer.TransShape(old_format, old_shape, dt, FORMAT_NC1HWC0, dst_shape_dims) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "TransShape failed"); - return FAILED; - } - } else if (old_format == FORMAT_NHWC) { - formats::FormatTransferNhwcNc1hwc0 transfer; - if (transfer.TransShape(old_format, old_shape, dt, FORMAT_NC1HWC0, dst_shape_dims) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "TransShape failed"); - return FAILED; - } + if (TransferShape2NC1HWC0(old_format, old_shape, dt, FORMAT_NC1HWC0, dst_shape_dims) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Trans shape failed"); + return FAILED; } - input->SetShape(ge::GeShape(dst_shape_dims)); - input->SetOriginShape(ge::GeShape(dst_shape_dims)); input->SetFormat(FORMAT_NC1HWC0); input->SetOriginFormat(FORMAT_NC1HWC0); + input->SetShape(ge::GeShape(dst_shape_dims)); + input->SetOriginShape(ge::GeShape(dst_shape_dims)); auto output = op_desc->MutableOutputDesc(0); GE_CHECK_NOTNULL(output); - output->SetShape(ge::GeShape(dst_shape_dims)); - output->SetOriginShape(ge::GeShape(dst_shape_dims)); output->SetFormat(FORMAT_NC1HWC0); output->SetOriginFormat(FORMAT_NC1HWC0);
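// For reference, a minimal sketch of the shape mapping TransferShape2NC1HWC0 performs above,
// assuming the usual channel block size C0 = 16:
//   NCHW (N, C, H, W) -> NC1HWC0 (N, ceil(C / C0), H, W, C0)
//   NHWC (N, H, W, C) -> NC1HWC0 (N, ceil(C / C0), H, W, C0)
// e.g. an NCHW input of shape (8, 3, 224, 224) becomes (8, 1, 224, 224, 16),
// with the channel axis zero-padded from 3 up to one full block of 16.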
+ output->SetShape(ge::GeShape(dst_shape_dims)); + output->SetOriginShape(ge::GeShape(dst_shape_dims)); - uint32_t size = 0; + int64_t size = 0; graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(*output, size); if (graph_status != ge::GRAPH_SUCCESS) { GELOGE(graph_status, "GetTensorSizeInBytes failed!"); return FAILED; } - ge::TensorUtils::SetSize(*input, size); ge::TensorUtils::SetSize(*output, size); + ge::TensorUtils::SetSize(*input, size); return SUCCESS; } + Status ProcessInputNC1HWC0(NodePtr &node_ptr) { GE_CHECK_NOTNULL(node_ptr); auto op_desc = node_ptr->GetOpDesc(); @@ -603,6 +581,161 @@ Status ProcessInputNC1HWC0(NodePtr &node_ptr) { } return SUCCESS; } + +Status ProcessDataNode(NodePtr &node_ptr) { + bool set_fp16 = false; + if (!ge::AttrUtils::GetBool(node_ptr->GetOpDesc(), "input_fp16", set_fp16) || !set_fp16) { + return SUCCESS; + } + for (auto const &next_node : node_ptr->GetOutNodes()) { + if (next_node->GetType() == AIPP) { + GELOGE(INTERNAL_ERROR, + "This input node [%s] is linked to aipp, can not be set to fp16," + "please check your atc param insert_op_conf, input_fp16_nodes.", + node_ptr->GetName().c_str()); + return FAILED; + } + } + GELOGI("input_fp16 is found, the node name is %s", node_ptr->GetName().c_str()); + if (ProcessInputFP16(node_ptr) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "ProcessInputFP16 failed"); + return FAILED; + } + // check if the format needs to be set + bool set_format = false; + if (!ge::AttrUtils::GetBool(node_ptr->GetOpDesc(), "input_set_nc1hwc0", set_format) || !set_format) { + return SUCCESS; + } + GELOGI("The format of node [%s] should be set to NC1HWC0", node_ptr->GetName().c_str()); + if (ProcessInputNC1HWC0(node_ptr) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "ProcessInputNC1HWC0 failed"); + return FAILED; + } + return SUCCESS; +} + +Status ProcessNetoutputNodeFp16Nc1hwc0(GeTensorDesc &src_desc, const InDataAnchorPtr &in_anchor, + GeTensorDescPtr &net_output_input_desc, NodePtr &node) { + ge::GeShape src_shape = src_desc.GetShape(); + ge::Format src_format = src_desc.GetFormat(); + ge::DataType src_dtype = src_desc.GetDataType(); + if (src_dtype != DT_FLOAT16) { + auto peer_out = in_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out); + NodePtr cast_node = CreateCastOp(src_shape, src_dtype, DT_FLOAT16, src_format, node); + GE_CHECK_NOTNULL(cast_node); + if (GraphUtils::InsertNodeBetweenDataAnchors(peer_out, in_anchor, cast_node) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "InsertNodeBetweenDataAnchors failed"); + return FAILED; + } + net_output_input_desc->SetDataType(DT_FLOAT16); + net_output_input_desc->SetOriginDataType(DT_FLOAT16); + } + if (src_format == FORMAT_NC1HWC0) { + GELOGI("Format is NC1HWC0, no need to transfer"); + return SUCCESS; + } + std::vector<int64_t> dst_shape_dims; + std::vector<int64_t> src_shape_dims = src_shape.GetDims(); + if (TransferShape2NC1HWC0(src_format, src_shape_dims, DT_FLOAT16, FORMAT_NC1HWC0, dst_shape_dims) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Trans shape failed"); + return FAILED; + } + ge::GeShape dst_shape(dst_shape_dims); + NodePtr trans_node = CreateTransdataNode(src_shape, src_format, dst_shape, FORMAT_NC1HWC0, DT_FLOAT16, node); + GE_CHECK_NOTNULL(trans_node); + auto peer_out_new = in_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out_new); + if (GraphUtils::InsertNodeBetweenDataAnchors(peer_out_new, in_anchor, trans_node) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "InsertNodeBetweenDataAnchors failed"); + return FAILED; + } + net_output_input_desc->SetFormat(FORMAT_NC1HWC0);
+ net_output_input_desc->SetOriginFormat(FORMAT_NC1HWC0); + net_output_input_desc->SetShape(dst_shape); + net_output_input_desc->SetOriginShape(dst_shape); + return SUCCESS; +} + +Status ProcessNetoutputNode(NodePtr &node, std::string &output_type) { + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + ge::DataType output_data_type = ge::DT_FLOAT; + bool is_set_output_type = false; + if (output_type_str_to_datatype.find(output_type) != output_type_str_to_datatype.end()) { + output_data_type = output_type_str_to_datatype[output_type]; + is_set_output_type = true; + } else { + GELOGI("output_type [%s] is not set or has an unexpected value", output_type.c_str()); + is_set_output_type = false; + } + + for (const auto &in_anchor : node->GetAllInDataAnchors()) { + auto index = static_cast<uint32_t>(in_anchor->GetIdx()); + auto peer_out = in_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out); + auto src_index = static_cast<uint32_t>(peer_out->GetIdx()); + auto own_node = peer_out->GetOwnerNode(); + OpDescPtr src_op_desc = own_node->GetOpDesc(); + GE_CHECK_NOTNULL(src_op_desc); + auto net_output_input_desc = op_desc->MutableInputDesc(index); + GE_CHECK_NOTNULL(net_output_input_desc); + auto net_output_output_desc = op_desc->MutableOutputDesc(index); + GE_CHECK_NOTNULL(net_output_output_desc); + // Update netoutput output desc + net_output_output_desc->SetDataType(net_output_input_desc->GetDataType()); + net_output_output_desc->SetOriginDataType(net_output_input_desc->GetDataType()); + net_output_output_desc->SetFormat(net_output_input_desc->GetFormat()); + net_output_output_desc->SetOriginFormat(net_output_input_desc->GetFormat()); + net_output_output_desc->SetShape(net_output_input_desc->GetShape()); + net_output_output_desc->SetOriginShape(net_output_input_desc->GetShape()); + + ge::GeShape src_shape = src_op_desc->GetOutputDesc(src_index).GetShape(); + ge::Format src_format = src_op_desc->GetOutputDesc(src_index).GetFormat(); + ge::DataType src_dtype = src_op_desc->GetOutputDesc(src_index).GetDataType(); + // Update datatype + if (is_set_output_type) { + GELOGI("Enter the output_type processing schedule"); + if (src_dtype == output_data_type) { + GELOGI("Data type is the same, no need to transfer."); + continue; + } + NodePtr cast_node = CreateCastOp(src_shape, src_dtype, output_data_type, src_format, node); + if (GraphUtils::InsertNodeBetweenDataAnchors(peer_out, in_anchor, cast_node) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "InsertNodeBetweenDataAnchors failed"); + return FAILED; + } + net_output_input_desc->SetDataType(output_data_type); + net_output_input_desc->SetOriginDataType(output_data_type); + net_output_output_desc->SetDataType(output_data_type); + net_output_output_desc->SetOriginDataType(output_data_type); + continue; + } + // output_type is not set, check if is_output_adjust_hw_layout is set + bool set_fp16_nc1hwc0 = false; + if (AttrUtils::GetBool(src_op_desc, "output_set_fp16_nc1hwc0", set_fp16_nc1hwc0)) { + if (set_fp16_nc1hwc0) { + GELOGI("Node [%s] should be set to FP16 and NC1HWC0", src_op_desc->GetName().c_str()); + if ((src_format != FORMAT_NCHW) && (src_format != FORMAT_NHWC) && (src_format != FORMAT_NC1HWC0)) { + GELOGE(INTERNAL_ERROR, "Format is not one of NCHW, NHWC, NC1HWC0."); + return FAILED; + } + GeTensorDesc src_desc(src_shape, src_format, src_dtype); + if (ProcessNetoutputNodeFp16Nc1hwc0(src_desc, in_anchor, net_output_input_desc, node) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Process netoutput fp16 nc1hwc0 failed."); + return FAILED; + }
+ net_output_output_desc->SetDataType(net_output_input_desc->GetDataType()); + net_output_output_desc->SetOriginDataType(net_output_input_desc->GetDataType()); + net_output_output_desc->SetFormat(net_output_input_desc->GetFormat()); + net_output_output_desc->SetOriginFormat(net_output_input_desc->GetFormat()); + net_output_output_desc->SetShape(net_output_input_desc->GetShape()); + net_output_output_desc->SetOriginShape(net_output_input_desc->GetShape()); + } + } + } + return SUCCESS; +} } // namespace GraphPrepare::GraphPrepare() : compute_graph_(nullptr) {} @@ -661,6 +794,12 @@ Status GraphPrepare::Init(const ge::Graph &graph, uint64_t session_id) { GELOGE(ret, "RunGraph graph check fail, ret:%u", ret); return ret; } + compute_graph_->TopologicalSorting(); + ret = CheckRefOp(); + if (ret != SUCCESS) { + GELOGE(ret, "RunGraph check ref op fail, ret:%u", ret); + return ret; + } return SUCCESS; } @@ -685,8 +824,83 @@ Status GraphPrepare::CheckGraph() { return SUCCESS; } +Status GraphPrepare::CheckRefInputNode(const NodePtr &node, const std::string &input_name, + const std::unordered_set<NodePtr> &ref_nodes) { + static std::unordered_set<std::string> acceptable_types = {ge::VARIABLE, ge::VARIABLEV2, ge::VARHANDLEOP, + ge::REFSWITCH, ge::REFMERGE, ge::REFENTER, + ge::REFNEXTITERATION, ge::REFEXIT}; + GE_CHECK_NOTNULL(node); + const auto &op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + const auto input_index = op_desc->GetInputIndexByName(input_name); + const auto &in_anchor = node->GetInDataAnchor(input_index); + GE_CHECK_NOTNULL(in_anchor); + const auto &peer_out_anchor = in_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out_anchor); + const auto &input_node = peer_out_anchor->GetOwnerNode(); + GE_CHECK_NOTNULL(input_node); + const auto &input_op_desc = input_node->GetOpDesc(); + GE_CHECK_NOTNULL(input_op_desc); + + bool is_ref = (ref_nodes.find(input_node) != ref_nodes.end()); + if (is_ref) { + return SUCCESS; + } + auto input_type = input_op_desc->GetType(); + if (input_type == ge::FRAMEWORKOP) { + if (!ge::AttrUtils::GetStr(input_op_desc, ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, input_type)) { + GELOGE(PARAM_INVALID, "Get original type failed."); + return PARAM_INVALID; + } + } + bool is_acceptable = (acceptable_types.find(input_type) != acceptable_types.end()); + + if (!is_acceptable) { + GELOGE(PARAM_INVALID, "The ref input of ref node %s[%s] must be a ref node or variable, but %s[%s] isn't.", + node->GetName().c_str(), node->GetType().c_str(), input_op_desc->GetName().c_str(), + input_op_desc->GetType().c_str()); + return PARAM_INVALID; + } + + return SUCCESS; +} + +Status GraphPrepare::CheckRefOp() { + GE_CHECK_NOTNULL(compute_graph_); + std::unordered_set<NodePtr> ref_nodes; + for (const NodePtr &node : compute_graph_->GetDirectNode()) { + if (node == nullptr) { + GELOGE(PARAM_INVALID, "param [node] must not be null."); + return PARAM_INVALID; + } + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(PARAM_INVALID, "OpDesc of param [node] must not be null."); + return PARAM_INVALID; + } + + auto input_names = op_desc->GetAllInputNames(); + auto outputs = op_desc->GetAllOutputName(); + std::unordered_set<std::string> all_output_name; + + for (auto &output : outputs) { + all_output_name.insert(output.first); + } + for (const auto &input_name : input_names) { + if (all_output_name.find(input_name) != all_output_name.end()) { + if (CheckRefInputNode(node, input_name, ref_nodes) != SUCCESS) { + GELOGE(PARAM_INVALID, "CheckRefInputNode failed."); + return PARAM_INVALID; + } + (void)ref_nodes.insert(node);
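// For reference: this check relies on the ref-op naming convention, assuming TensorFlow-style
// ref semantics, where an output desc reuses the name of the input it updates in place
// (so input_names and all_output_name intersect). The aliased input chain must then start
// from a variable-like or ref-passing node, i.e. one of the acceptable_types listed above,
// or from a node already recognized as a ref op.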
} + } + } + return SUCCESS; +}; + Status GraphPrepare::SetRtContext(rtContext_t rt_context, rtCtxMode_t mode) { - GELOGI("Set rt_context %d, device id:%u.", static_cast(mode), ge::GetContext().DeviceId()); + GELOGI("set rt_context %d, device id:%u.", static_cast(mode), ge::GetContext().DeviceId()); GE_CHK_RT_RET(rtCtxCreate(&rt_context, mode, ge::GetContext().DeviceId())); GE_CHK_RT_RET(rtCtxSetCurrent(rt_context)); RtContextUtil::GetInstance().AddrtContext(rt_context); @@ -704,7 +918,7 @@ Status GraphPrepare::AdjustDataOpOutput(const NodePtr &node) { return GE_GRAPH_GRAPH_NODE_NULL; } GeTensorDesc output = op_desc_ptr->GetOutputDesc(0); - uint32_t tensor_size = 0; + int64_t tensor_size = 0; graphStatus graph_status = TensorUtils::GetTensorMemorySizeInBytes(output, tensor_size); if (graph_status != GRAPH_SUCCESS) { GELOGE(graph_status, "GetTensorMemorySizeInBytes failed!"); @@ -721,13 +935,13 @@ Status GraphPrepare::AdjustDataOpOutput(const NodePtr &node) { Status GraphPrepare::UpdateInput(const std::vector &user_input) { compute_graph_->SaveDataFormat((ge::Format)(domi::GetContext().format)); - for (NodePtr &input_node : compute_graph_->GetAllNodes()) { + for (NodePtr &input_node : compute_graph_->GetDirectNode()) { GE_CHECK_NOTNULL(input_node); OpDescPtr op = input_node->GetOpDesc(); GE_CHECK_NOTNULL(op); if (op->GetType() == DATA) { GeAttrValue::INT index = 0; - if (!(AttrUtils::GetInt(op, ATTR_NAME_INDEX, index))) { + if ((!(AttrUtils::GetInt(op, ATTR_NAME_INDEX, index))) || (domi::GetContext().is_dynamic_input)) { GELOGW("Get index from data attr failed"); continue; } @@ -759,15 +973,16 @@ Status GraphPrepare::UpdateInput(const std::vector &user_input) { FMK_INT64_UINT32_MULCHECK(desc_shape, length); int64_t shape_size = desc_shape * length; GE_IF_BOOL_EXEC(shape_size == 0, shape_size = static_cast(length)); - uint32_t size = 0; - // [No need to check return value] - ge::TensorUtils::GetSize(desc, size); - if ((size != 0) && (shape_size != static_cast(size))) { - GELOGE(PARAM_INVALID, "input data size =%u, shape_size =%ld.", size, shape_size); + int64_t size = 0; + GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(desc, size) != GRAPH_SUCCESS, + GELOGE(INTERNAL_ERROR, "TensorUtils GetSize failed"); + return FAILED); + if ((size != 0) && (shape_size != size)) { + GELOGE(PARAM_INVALID, "input data size =%ld, shape_size =%ld.", size, shape_size); return FAILED; } - ge::TensorUtils::SetSize(desc, static_cast(shape_size)); + ge::TensorUtils::SetSize(desc, shape_size); graphStatus graph_ret = op->UpdateInputDesc(0, desc); if (graph_ret != GRAPH_SUCCESS) { @@ -883,36 +1098,26 @@ Status GraphPrepare::ResourcePairProcess(const std::string &action) { return SUCCESS; } -Status GraphPrepare::OptimizeForDataAfterInfershape() { - for (auto node_ptr : compute_graph_->GetAllNodes()) { +Status GraphPrepare::OptimizeAfterInfershapeByAtcParams() { + if (options_.train_graph_flag) { + GELOGI("This is train mode, no need to do this schedule."); + return SUCCESS; + } + GE_RETURN_IF_ERROR(InsertNewOpUtil::Instance().UpdateDataNodeByAipp(compute_graph_)); + for (auto &node_ptr : compute_graph_->GetDirectNode()) { GE_CHECK_NOTNULL(node_ptr); - if (node_ptr->GetType() == AIPP) { - if (SetDataNodeByAipp(node_ptr) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Set data node by aipp failed"); + if (node_ptr->GetType() == DATA) { + if (ProcessDataNode(node_ptr) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Process data node failed"); return FAILED; } } - if (node_ptr->GetType() != DATA) { - continue; - } - bool set_fp16 = false; - if 
(!ge::AttrUtils::GetBool(node_ptr->GetOpDesc(), "input_fp16", set_fp16) || !set_fp16) { - continue; - } - GELOGI("input_node_set_fp16 is found, the name is %s", node_ptr->GetName().c_str()); - if (ProcessInputFP16(node_ptr) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "ProcessInputFP16 failed"); - return FAILED; - } - // check if need to set format - bool set_format = false; - if (!ge::AttrUtils::GetBool(node_ptr->GetOpDesc(), "input_set_nc1hwc0", set_format) || !set_format) { - continue; - } - GELOGI("Find a node [%s] should set NC1HWC0", node_ptr->GetName().c_str()); - if (ProcessInputNC1HWC0(node_ptr) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "ProcessInputNC1HWC0 failed"); - return FAILED; + + if (node_ptr->GetType() == ge::NETOUTPUT) { + if (ProcessNetoutputNode(node_ptr, options_.output_datatype) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Process netoutput node failed"); + return FAILED; + } } } return SUCCESS; @@ -934,6 +1139,7 @@ void GraphPrepare::ProcessCCEFormat() { if (org_tensor_input.GetFormat() == FORMAT_ND) { org_tensor_input.SetFormat(FORMAT_NCHW); org_tensor_input.SetOriginFormat(FORMAT_NCHW); + // [No need to check value] (void)node_op_desc->UpdateInputDesc(i, org_tensor_input); } } @@ -944,6 +1150,7 @@ void GraphPrepare::ProcessCCEFormat() { if (org_tensor_output.GetFormat() == FORMAT_ND) { org_tensor_output.SetFormat(FORMAT_NCHW); org_tensor_output.SetOriginFormat(FORMAT_NCHW); + // [No need to check value] (void)node_op_desc->UpdateOutputDesc(i, org_tensor_output); } } @@ -982,7 +1189,7 @@ Status GraphPrepare::OptimizeBeforeInfershape() { } void GraphPrepare::SaveOriginalGraphToOmModel() { - if (options_.save_original_model) { + if (options_.save_original_model == "true") { ModelHelper model_helper; Status ret = model_helper.SaveOriginalGraphToOmModel(ge::GraphUtils::CreateGraphFromComputeGraph(compute_graph_), options_.original_model_file); @@ -1029,16 +1236,28 @@ Status GraphPrepare::Preprocess(const std::vector &user_input) { return FAILED; } } - ret = TryDoAipp(); + + GE_TIMESTAMP_START(netoutput_process); + ret = ProcessNetOutput(); + GE_TIMESTAMP_END(netoutput_process, "GraphPrepare::NetOutputProcess") if (ret != SUCCESS) { return ret; } + GE_TIMESTAMP_START(multibatch_process); + ret = multibatch::ProcessMultiBatch(compute_graph_); + GE_TIMESTAMP_END(multibatch_process, "GraphPrepare::MultiBatchProcess") + if (ret != SUCCESS) { + GELOGE(ret, "Failed to do multi-batch processing"); + return ret; + } + GraphUtils::DumpGEGraph(compute_graph_, "after_multibatch_process"); + GraphUtils::DumpGEGraphToOnnx(*compute_graph_, "after_multibatch_process"); - ret = OptimizeBeforeInfershape(); + ret = TryDoAipp(); if (ret != SUCCESS) { - GELOGE(ret, "OptimizeBeforeInfershape failed."); return ret; } + GE_TIMESTAMP_START(FormatAndShapeProcess); ret = FormatAndShapeProcess(); GE_TIMESTAMP_END(FormatAndShapeProcess, "GraphPrepare::FormatAndShapeProcess"); @@ -1051,7 +1270,7 @@ Status GraphPrepare::Preprocess(const std::vector &user_input) { ProcessCCEFormat(); - ret = OptimizeForDataAfterInfershape(); + ret = OptimizeAfterInfershapeByAtcParams(); if (ret != SUCCESS) { GELOGE(ret, "Optimize for input if set inputfp16 failed."); return ret; @@ -1139,6 +1358,13 @@ Status GraphPrepare::Prepare(ConstGraphPtr graph, const std::vector &u return ret; } + GE_TIMESTAMP_START(OptimizeBeforeSubGraph); + ret = OptimizeGraphBeforeSubGraph(); + GE_TIMESTAMP_END(OptimizeBeforeSubGraph, "GraphPrepare::OptimizeBeforeSubGraph"); + if (ret != SUCCESS) { + GELOGE(ret, "originalGraph optimize Failed"); + return 
ret; + } compute_graph = compute_graph_; return SUCCESS; } @@ -1198,12 +1424,12 @@ Status GraphPrepare::VerifyConstOp(const NodePtr &node) { } Status GraphPrepare::CheckUserInput(const std::vector &user_input) { - if (user_input.empty()) { + if (user_input.empty() || domi::GetContext().is_dynamic_input) { return SUCCESS; } unsigned int node_num = 0; unsigned int data_num = 0; - for (NodePtr &input_node : compute_graph_->GetAllNodes()) { + for (NodePtr &input_node : compute_graph_->GetDirectNode()) { GE_CHECK_NOTNULL(input_node); OpDescPtr op = input_node->GetOpDesc(); GE_CHECK_NOTNULL(op); @@ -1246,6 +1472,8 @@ Status GraphPrepare::InferShapeForPreprocess() { } InferShapePass infer_shape_pass; names_to_passes.emplace_back("InferShapePass", &infer_shape_pass); + ReplaceWithEmptyConstPass replace_with_empty_const_pass; + names_to_passes.emplace_back("ReplaceWithEmptyConstPass", &replace_with_empty_const_pass); DimensionComputePass dimension_compute_pass; names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass); ConstantFoldingPass constant_folding_pass; @@ -1285,7 +1513,6 @@ Status GraphPrepare::InferShapeForPreprocess() { Status GraphPrepare::OptimizeForPreprocess() { GELOGI("Start optimize for preprocess."); - PassManager original_graph_passes; // Graph pass try { @@ -1322,9 +1549,7 @@ Status GraphPrepare::OptimizeForPreprocess() { // for infer DropOutPass dropout_pass; AssertPass assert_pass; - ReUpdateNetOutputPass re_update_net_output_pass; if (!options_.train_graph_flag) { - names_to_passes.emplace_back("ReUpdateNetOutputPass", &re_update_net_output_pass); names_to_passes.emplace_back("DropOutPass", &dropout_pass); names_to_passes.emplace_back("AssertPass", &assert_pass); } @@ -1390,7 +1615,7 @@ Status GraphPrepare::OptimizeForPreprocess() { } // The constant for train is CONSTANTOP, and is CONSTANT for inference. They will be unified in future. 
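// For reference (assuming the type strings registered in common/types.cc): CONSTANT is the
// inference-time type string "Const" and CONSTANTOP is the training-time "Constant", so the
// loop below only rewrites the type field on each existing OpDesc instead of rebuilding the
// constant nodes.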
if (options_.train_graph_flag) { - for (ge::NodePtr &n : compute_graph_->GetDirectNode()) { + for (ge::NodePtr &n : compute_graph_->GetAllNodes()) { // This can ensure that n is not a null pointer if (n->GetOpDesc()->GetType() == CONSTANT) { n->GetOpDesc()->SetType(CONSTANTOP); @@ -1408,4 +1633,43 @@ Status GraphPrepare::OptimizeForPreprocess() { return SUCCESS; } + +Status GraphPrepare::ProcessNetOutput() { + PassManager graph_passes_before_infershape; + try { + if (options_.train_graph_flag) { + graph_passes_before_infershape.AddPass(new (std::nothrow) SavePass); + } + graph_passes_before_infershape.AddPass(new (std::nothrow) NetOutputPass); + } catch (std::bad_alloc) { + GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs."); + return INTERNAL_ERROR; + } + + auto ret = graph_passes_before_infershape.Run(compute_graph_); + if ((ret != SUCCESS) && (ret != NOT_CHANGED)) { + GELOGE(ret, "Run graph_passes_before_infershape failed, ret:%d.", ret); + return ret; + } + return SUCCESS; +} +Status GraphPrepare::OptimizeGraphBeforeSubGraph() { + PassManager passes; + (void)passes.AddPass(new (std::nothrow) CommonSubexpressionEliminationPass); + auto ret = passes.Run(compute_graph_); + if (ret != SUCCESS) { + GELOGE(ret, "Failed to optimize for graph"); + return ret; + } + ConstantFoldingPass constant_folding_pass; + NamesToPass names_to_passes; + names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass); + GEPass ge_passes(compute_graph_); + ret = ge_passes.Run(names_to_passes); + if (ret != SUCCESS) { + GELOGE(ret, "Failed to optimize for graph"); + return ret; + } + return SUCCESS; +} } // namespace ge diff --git a/src/ge/graph/preprocess/graph_preprocess.h b/src/ge/graph/preprocess/graph_preprocess.h index c53edf43..002d45ab 100644 --- a/src/ge/graph/preprocess/graph_preprocess.h +++ b/src/ge/graph/preprocess/graph_preprocess.h @@ -52,6 +52,9 @@ class GraphPrepare { Status Init(const ge::Graph &graph, uint64_t session_id = 0); Status Preprocess(const std::vector &user_input); Status CheckGraph(); + Status CheckRefInputNode(const NodePtr &node, const std::string &input_name, + const std::unordered_set &ref_nodes); + Status CheckRefOp(); Status SetRtContext(rtContext_t rt_context, rtCtxMode_t mode); Status AdjustDataOpOutput(const NodePtr &node); Status UpdateInput(const std::vector &user_input); @@ -61,13 +64,15 @@ class GraphPrepare { Status OptimizeForPreprocess(); Status InferShapeForPreprocess(); Status TryDoAipp(); - Status OptimizeForDataAfterInfershape(); + Status OptimizeAfterInfershapeByAtcParams(); Status UpdateVariableFormats(ComputeGraphPtr &graph); Status FormatAndShapeProcess(); Status ResourcePairProcess(const std::string &action); void ProcessCCEFormat(); Status OptimizeBeforeInfershape(); + Status OptimizeGraphBeforeSubGraph(); void SaveOriginalGraphToOmModel(); + Status ProcessNetOutput(); ge::ComputeGraphPtr compute_graph_; GraphManagerOptions options_; }; diff --git a/src/ge/graph/preprocess/insert_op/base_insert_op.cc b/src/ge/graph/preprocess/insert_op/base_insert_op.cc deleted file mode 100644 index 37bbd48b..00000000 --- a/src/ge/graph/preprocess/insert_op/base_insert_op.cc +++ /dev/null @@ -1,222 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "graph/preprocess/insert_op/base_insert_op.h" -#include -#include -#include "external/graph/operator_factory.h" -#include "external/graph/operator.h" -#include "framework/common/debug/ge_log.h" -#include "common/op/attr_value_util.h" -#include "common/op/ge_op_utils.h" -#include "common/types.h" -#include "common/util.h" -#include "framework/common/ge_inner_error_codes.h" -#include "graph/op_desc.h" -#include "graph/utils/graph_utils.h" -#include "graph/utils/op_desc_utils.h" -#include "graph/utils/tensor_utils.h" -#include "common/ge/ge_util.h" -#include "graph/debug/ge_attr_define.h" -#include "common/math/math_util.h" - -namespace ge { -static const char *const kAippConfigPath = "aipp_config_route"; -static const uint32_t kImageRatioYuv420SpU8Mul = 3; -static const uint32_t kImageRatioYuv420SpU8Div = 2; -static const uint32_t kImageRatioXrgb8888U8 = 4; -static const uint32_t kImageRatioRgb888U8 = 3; - -Status InsertOpBase::InsertAippToGraph(ComputeGraphPtr &graph, std::string &aippConfigPath, - ge::NodePtr &inserted_aipp_node) { - GE_CHECK_NOTNULL(graph); - NodePtr target_input = nullptr; - std::vector> target_edges; - GE_CHK_STATUS_RET(this->GetTargetPosition(graph, target_input, target_edges), "Get data nodes position failed"); - OpDescPtr op_desc = ge::MakeShared("", ""); - if (op_desc == nullptr) { - return FAILED; - } - GE_CHK_STATUS_RET(this->GenerateOpDesc(op_desc), "Generate aipp node opdesc failed"); - ge::GeAttrValue::NamedAttrs aipp_attr; - GE_IF_BOOL_EXEC(!AttrUtils::GetNamedAttrs(op_desc, ATTR_NAME_AIPP, aipp_attr), - GELOGW("InsertAippToGraph: GetNamedAttrs failed"); - return FAILED) - - auto opdesc_src_data = target_input->GetOpDesc()->GetOutputDesc(0); // [Cascade pointer] - if (opdesc_src_data.GetDataType() != DT_FLOAT) { - GELOGW("The datatype of data node %s is not FP32", target_input->GetName().c_str()); - opdesc_src_data.SetDataType(DT_FLOAT); - } - - static uint32_t op_idx = 0; - // Does not involve multithreading. 
- std::string current_name = std::string("aipp_node").append(std::to_string(op_idx++)); - auto aipp_op = ge::OperatorFactory::CreateOperator(current_name, "Aipp"); - GE_CHK_BOOL_RET_STATUS(!aipp_op.IsEmpty(), PARAM_INVALID, "Aipp is not registered"); - auto aipp_opdesc_ptr = ge::OpDescUtils::GetOpDescFromOperator(aipp_op); - GE_CHECK_NOTNULL(aipp_opdesc_ptr); - GE_IF_BOOL_EXEC(!AttrUtils::SetNamedAttrs(aipp_opdesc_ptr, ATTR_NAME_AIPP, aipp_attr), - GELOGE(FAILED, "SetNameAttrs failed"); - return FAILED;) - - unique_ptr aipp_params(new (std::nothrow) domi::AippOpParams()); - GE_CHECK_NOTNULL(aipp_params); - GE_CHK_STATUS_RET(ge::OpUtils::ConvertAippParams(aipp_attr, aipp_params.get()), "Get aipp params failed") - GE_CHK_STATUS_RET(aipp_opdesc_ptr->UpdateInputDesc(0, opdesc_src_data)) - - if (aipp_params->aipp_mode() == domi::AippOpParams::dynamic) { - Status ret = aipp_opdesc_ptr->UpdateInputDesc(1, opdesc_src_data); - if (ret != SUCCESS) { - return FAILED; - } - } - GE_IF_BOOL_EXEC(!AttrUtils::SetStr(aipp_opdesc_ptr, kAippConfigPath, aippConfigPath), - GELOGW("SetStr kAippConfigPath failed");) - GELOGI("Aipp config path is %s", aippConfigPath.c_str()); - - // for data dump - GE_IF_BOOL_EXEC( - !AttrUtils::SetListStr(aipp_opdesc_ptr, ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, std::move(std::vector())), - GELOGW("InsertAippToGraph: SetListStr failed");) - - NodePtr insert_op = graph->AddNode(aipp_opdesc_ptr); - GE_CHECK_NOTNULL(insert_op); - OutDataAnchorPtr target_input_out = target_input->GetOutDataAnchor(0); - GE_CHECK_NOTNULL(target_input_out); - InDataAnchorPtr insert_op_in = insert_op->GetInDataAnchor(0); - GE_CHECK_NOTNULL(insert_op_in); - OutDataAnchorPtr insert_op_out = insert_op->GetOutDataAnchor(0); - GE_CHECK_NOTNULL(insert_op_out); - - inserted_aipp_node = insert_op; - if (target_edges.size() == 1) { - OutDataAnchorPtr src_out = target_edges[0].first; - InDataAnchorPtr dst_in = target_edges[0].second; - GE_CHK_STATUS_RET(GraphUtils::InsertNodeBetweenDataAnchors(src_out, dst_in, insert_op)) - return SUCCESS; - } - for (auto &edge : target_edges) { - OutDataAnchorPtr src_out = edge.first; - GE_CHECK_NOTNULL(src_out); - InDataAnchorPtr dst_in = edge.second; - GE_CHK_STATUS_RET(src_out->Unlink(dst_in), "Unlink the anchor failed"); - GE_CHK_STATUS_RET(insert_op_out->LinkTo(dst_in), "Link the anchor failed"); - } - GE_CHK_STATUS_RET(target_input_out->LinkTo(insert_op_in), "Link the anchor failed"); - return SUCCESS; -} - -uint32_t InsertOpBase::AdjustDataSize(const GeTensorDesc &input_desc, unique_ptr &aipp_params) { - GE_CHECK_NOTNULL(aipp_params); - if (aipp_params->aipp_mode() == domi::AippOpParams::static_) { - uint32_t size = input_desc.GetShape().GetDim(NCHW_DIM_N); - const uint32_t h = (input_desc.GetFormat() == ge::FORMAT_NHWC) ? NHWC_DIM_H : NCHW_DIM_H; - const uint32_t w = (input_desc.GetFormat() == ge::FORMAT_NHWC) ? NHWC_DIM_W : NCHW_DIM_W; - const uint32_t shape_h = - aipp_params->src_image_size_h() ? aipp_params->src_image_size_h() : input_desc.GetShape().GetDim(h); - FMK_UINT32_MULCHECK(size, shape_h); - size *= shape_h; - const uint32_t shape_w = - aipp_params->src_image_size_w() ? 
aipp_params->src_image_size_w() : input_desc.GetShape().GetDim(w); - FMK_UINT32_MULCHECK(size, shape_w); - size *= shape_w; - if (aipp_params->input_format() == domi::AippOpParams::YUV420SP_U8) { - FMK_UINT32_MULCHECK((size / kImageRatioYuv420SpU8Div), kImageRatioYuv420SpU8Mul); - size = size / kImageRatioYuv420SpU8Div * kImageRatioYuv420SpU8Mul; // avoid use float - } else if (aipp_params->input_format() == domi::AippOpParams::XRGB8888_U8) { - FMK_UINT32_MULCHECK(size, kImageRatioXrgb8888U8); - size *= kImageRatioXrgb8888U8; - } else if (aipp_params->input_format() == domi::AippOpParams::RGB888_U8) { - FMK_UINT32_MULCHECK(size, kImageRatioRgb888U8); - size *= kImageRatioRgb888U8; - } - return size; - } else { - return aipp_params->max_src_image_size(); - } -} - -Status InsertOpBase::InsertOpToGraph(ComputeGraphPtr graph) { - GE_CHECK_NOTNULL(graph); - NodePtr target_input = nullptr; - std::vector> target_edges; - GE_CHK_STATUS_RET(this->GetTargetPosition(graph, target_input, target_edges), "Get nodes position failed"); - - // insertOp - OpDescPtr op_desc = MakeShared("", ""); - if (op_desc == nullptr) { - return FAILED; - } - GE_CHK_STATUS_RET(this->GenerateOpDesc(op_desc), "Generate aipp node failed"); - NodePtr insert_op = graph->AddNode(op_desc); - GE_CHECK_NOTNULL(insert_op); - OutDataAnchorPtr target_input_out = target_input->GetOutDataAnchor(0); - GE_CHECK_NOTNULL(target_input_out); - InDataAnchorPtr insert_op_in = insert_op->GetInDataAnchor(0); - GE_CHECK_NOTNULL(insert_op_in); - OutDataAnchorPtr insert_op_out = insert_op->GetOutDataAnchor(0); - GE_CHECK_NOTNULL(insert_op_out); - - if (target_edges.size() == 1) { - OutDataAnchorPtr src_out = target_edges[0].first; - InDataAnchorPtr dst_in = target_edges[0].second; - GE_CHK_STATUS_RET(GraphUtils::InsertNodeBetweenDataAnchors(src_out, dst_in, insert_op), - "InsertNodeBetweenDataAnchors failed"); - - return SUCCESS; - } - - for (auto &edge : target_edges) { - OutDataAnchorPtr src_out = edge.first; - GE_CHECK_NOTNULL(src_out); - InDataAnchorPtr dst_in = edge.second; - - GE_CHK_STATUS_RET(src_out->Unlink(dst_in), "Unlink the anchor failed"); - - GE_CHK_STATUS_RET(insert_op_out->LinkTo(dst_in), "Link the anchor failed"); - } - - GE_CHK_STATUS_RET(target_input_out->LinkTo(insert_op_in), "Link the anchor failed"); - - return SUCCESS; -} - -Status InsertOpBase::GetInputNode(ComputeGraphPtr graph, NodePtr &target_input, uint32_t rank) { - GE_CHECK_NOTNULL(graph); - std::vector input_nodes; - - for (ge::NodePtr &node : graph->GetAllNodes()) { - GE_CHECK_NOTNULL(node); - - ge::OpDescPtr op = node->GetOpDesc(); - GE_CHECK_NOTNULL(op); - - if (op->GetType() == DATA_TYPE) { - GE_CHK_BOOL_RET_STATUS(node->GetOutDataNodes().size() > 0, FAILED, "Data node %s has no output", - node->GetName().c_str()); - input_nodes.push_back(node); - } - } - - GE_CHK_BOOL_RET_STATUS(rank < input_nodes.size(), PARAM_INVALID, - "Get intput of index %d failed, There is %zu input nodes", rank, input_nodes.size()); - - target_input = input_nodes[rank]; - - return SUCCESS; -} -} // namespace ge diff --git a/src/ge/graph/preprocess/insert_op/base_insert_op.h b/src/ge/graph/preprocess/insert_op/base_insert_op.h index 57a39867..f482e34b 100644 --- a/src/ge/graph/preprocess/insert_op/base_insert_op.h +++ b/src/ge/graph/preprocess/insert_op/base_insert_op.h @@ -45,21 +45,13 @@ class InsertOpBase { /// virtual Status ValidateParams() = 0; - /// - /// @ingroup ge_omg - /// @brief Insert the insert_op operator into the network graph - /// @param [in] graph - /// - virtual Status 
InsertOpToGraph(ge::ComputeGraphPtr graph); - /// /// @ingroup ge_omg /// @brief Insert aipp operator into the network graph /// @param [in] graph /// @param [in] aippConfigPath aipp /// - virtual Status InsertAippToGraph(ge::ComputeGraphPtr &graph, std::string &aippConfigPath, - ge::NodePtr &inserted_aipp_node); + virtual Status InsertAippToGraph(ge::ComputeGraphPtr &graph, std::string &aippConfigPath, const uint32_t index) = 0; /// /// @ingroup ge_omg @@ -69,18 +61,6 @@ class InsertOpBase { virtual domi::AippOpParams::AippMode GetAippMode() = 0; protected: - /// - /// @ingroup ge_omg - /// @brief Get the input operator in the model - /// - static Status GetInputNode(ge::ComputeGraphPtr graph, ge::NodePtr &target_input, uint32_t rank); - - /// - /// @ingroup ge_omg - /// @brief Get the size of data bases on the input - /// - uint32_t AdjustDataSize(const ge::GeTensorDesc &desc, std::unique_ptr &aipp_params); - /// /// @ingroup ge_omg /// @brief Generate the insert_op operator diff --git a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc index 026aff40..d35bd84c 100644 --- a/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/ge_aipp_op.cc @@ -19,10 +19,25 @@ #include #include #include +#include +#include "proto/insert_op.pb.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/node_utils.h" +#include "graph/utils/op_desc_utils.h" +#include "graph/utils/tensor_utils.h" +#include "graph/utils/type_utils.h" #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" +#include "framework/common/op/ge_op_utils.h" +#include "framework/common/types.h" +#include "framework/omg/omg_inner_types.h" +#include "common/dynamic_aipp.h" +#include "common/ge/ge_util.h" #include "common/util.h" #include "graph/optimize/common/params.h" +#include "external/graph/operator_factory.h" +#include "base_insert_op.h" #define SAVE_AIPP_ATTR(KEY, SAVE_TYPE) \ do { \ @@ -37,38 +52,98 @@ } while (0) namespace { -const int32_t kDefaultMatrixR0C0Yuv2Rgb = 298; -const int32_t kDefaultMatrixR0C1Yuv2Rgb = 0; -const int32_t kDefaultMatrixR0C2Yuv2Rgb = 409; -const int32_t kDefaultMatrixR1C0Yuv2Rgb = 298; -const int32_t kDefaultMatrixR1C1Yuv2Rgb = -100; -const int32_t kDefaultMatrixR1C2Yuv2Rgb = -208; -const int32_t kDefaultMatrixR2C0Yuv2Rgb = 298; -const int32_t kDefaultMatrixR2C1Yuv2Rgb = 516; -const int32_t kDefaultMatrixR2C2Yuv2Rgb = 0; -const int32_t kDefaultMatrixR0C0Rgb2Yuv = 66; -const int32_t kDefaultMatrixR0C1Rgb2Yuv = 129; -const int32_t kDefaultMatrixR0C2Rgb2Yuv = 25; -const int32_t kDefaultMatrixR1C0Rgb2Yuv = -38; -const int32_t kDefaultMatrixR1C1Rgb2Yuv = -74; -const int32_t kDefaultMatrixR1C2Rgb2Yuv = 112; -const int32_t kDefaultMatrixR2C0Rgb2Yuv = 112; -const int32_t kDefaultMatrixR2C1Rgb2Yuv = -94; -const int32_t kDefaultMatrixR2C2Rgb2Yuv = -18; -const int32_t kDefaultOutputBias0 = 16; -const int32_t kDefaultOutputBias1 = 128; -const int32_t kDefaultOutputBias2 = 128; -const int32_t kDefaultInputBias0 = 16; -const int32_t kDefaultInputBias1 = 128; -const int32_t kDefaultInputBias2 = 128; -const float kDefaultVarReciChn = 1.0; +const int32_t DEFAULT_MATRIX_R0C0_YUV2RGB = 298; +const int32_t DEFAULT_MATRIX_R0C1_YUV2RGB = 0; +const int32_t DEFAULT_MATRIX_R0C2_YUV2RGB = 409; +const int32_t DEFAULT_MATRIX_R1C0_YUV2RGB = 298; +const int32_t DEFAULT_MATRIX_R1C1_YUV2RGB = -100; +const int32_t DEFAULT_MATRIX_R1C2_YUV2RGB = -208; +const int32_t 
DEFAULT_MATRIX_R2C0_YUV2RGB = 298; +const int32_t DEFAULT_MATRIX_R2C1_YUV2RGB = 516; +const int32_t DEFAULT_MATRIX_R2C2_YUV2RGB = 0; +const int32_t DEFAULT_MATRIX_R0C0_RGB2YUV = 66; +const int32_t DEFAULT_MATRIX_R0C1_RGB2YUV = 129; +const int32_t DEFAULT_MATRIX_R0C2_RGB2YUV = 25; +const int32_t DEFAULT_MATRIX_R1C0_RGB2YUV = -38; +const int32_t DEFAULT_MATRIX_R1C1_RGB2YUV = -74; +const int32_t DEFAULT_MATRIX_R1C2_RGB2YUV = 112; +const int32_t DEFAULT_MATRIX_R2C0_RGB2YUV = 112; +const int32_t DEFAULT_MATRIX_R2C1_RGB2YUV = -94; +const int32_t DEFAULT_MATRIX_R2C2_RGB2YUV = -18; +const int32_t DEFAULT_OUTPUT_BIAS_0 = 16; +const int32_t DEFAULT_OUTPUT_BIAS_1 = 128; +const int32_t DEFAULT_OUTPUT_BIAS_2 = 128; +const int32_t DEFAULT_INPUT_BIAS_0 = 16; +const int32_t DEFAULT_INPUT_BIAS_1 = 128; +const int32_t DEFAULT_INPUT_BIAS_2 = 128; +const float DEFAULT_VAR_RECI_CHN = 1.0; } // namespace namespace ge { namespace { -const std::set<std::string> kInsertAippExceptOp = {SHAPE, SSDPRIORBOX}; +const char *const kMbatchSwitchnName = "mbatch-switch-name"; +const char *const kAippConfigPath = "aipp_config_route"; +const char *const kCurrentAippIndex = "current_aipp_index"; +const char *const kDynamicAippData = "ascend_dynamic_aipp_data"; +const uint64_t kMinTransferShape = 3; +const int kAippImageInputIndex = 0; +const int kAippParamsInputIndex = 1; +const int kAippDataOutputIndex = 0; + +// the `format` must be one of NCHW or NHWC +Status GetDataDimN(const ge::NodePtr &data_node, ge::Format format, int64_t &batch) { + auto output_desc = NodeUtils::GetOutputDesc(*data_node, 0); + auto shape = output_desc.GetShape().GetDims(); + if (shape.size() == kMinTransferShape) { + batch = 1; + return SUCCESS; + } + if (shape.size() == DIM_DEFAULT_SIZE) { + switch (format) { + case FORMAT_NCHW: + batch = shape[NCHW_DIM_N]; + return SUCCESS; + case FORMAT_NHWC: + batch = shape[NHWC_DIM_N]; + return SUCCESS; + default: + GELOGE(PARAM_INVALID, "Not support data format: %s", TypeUtils::FormatToSerialString(format).c_str()); + return PARAM_INVALID; + } + } + GELOGE(PARAM_INVALID, "for dynamic aipp, the shape size must be in range [3, 4], but it is %zu", shape.size()); + return PARAM_INVALID; } +// the batch_count must be greater than 0 +int64_t CalcMaxSize(int64_t batch_count) { + batch_count--; + if (batch_count > 0) { + if (INT64_MAX / batch_count < static_cast<int64_t>(sizeof(kAippDynamicBatchPara))) { + return -1; + } + } + + int64_t size = batch_count * sizeof(kAippDynamicBatchPara); + if (INT64_MAX - static_cast<int64_t>(sizeof(kAippDynamicPara)) < size) { + return -1; + } + return size + sizeof(kAippDynamicPara); +} + +Format GetAndCheckFormat() { + switch (domi::GetContext().format) { + case domi::DOMI_TENSOR_NCHW: + return FORMAT_NCHW; + case domi::DOMI_TENSOR_NHWC: + return FORMAT_NHWC; + default: + GELOGE(PARAM_INVALID, "Unexpected format found %d", static_cast<int>(domi::GetContext().format)); + return FORMAT_ND; + } +} +} // namespace + Status AippOp::Init(domi::AippOpParams *aipp_params) { aipp_params_ = new (std::nothrow) domi::AippOpParams(); if (aipp_params_ == nullptr) { @@ -85,53 +160,216 @@ AippOp::~AippOp() { } } -domi::AippOpParams::AippMode AippOp::GetAippMode() { - if (aipp_params_ == nullptr) { - return domi::AippOpParams::undefined; +Status AippOp::InsertAippToGraph(ComputeGraphPtr &graph, std::string &aippConfigPath, const uint32_t index) { + GE_CHECK_NOTNULL(graph); + NodePtr target_input = nullptr; + std::vector<std::pair<OutDataAnchorPtr, InDataAnchorPtr>> target_edges; + GE_CHK_STATUS_RET(this->GetTargetPosition(graph, target_input, target_edges), "Get data nodes position failed"); +
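// For reference, the de-duplication below: one Aipp node is created per Data output anchor
// and cached in out_anchors_to_aipp, so several edges fanning out of the same anchor share
// a single Aipp node instead of each edge getting its own copy.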
+ std::map<OutDataAnchorPtr, NodePtr> out_anchors_to_aipp; + for (auto &out_in_anchors : target_edges) { + auto iter = out_anchors_to_aipp.find(out_in_anchors.first); + if (iter == out_anchors_to_aipp.end()) { + auto aipp = CreateAipp(graph, out_in_anchors.first, aippConfigPath, index); + GE_CHECK_NOTNULL(aipp); + out_anchors_to_aipp[out_in_anchors.first] = aipp; + + auto ret = GraphUtils::InsertNodeBetweenDataAnchors(out_in_anchors.first, out_in_anchors.second, aipp); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to link edges for aipp node %s", aipp->GetName().c_str()); + return INTERNAL_ERROR; + } + + // add aipp data if needed + if (GetAippMode() == domi::AippOpParams::dynamic) { + ret = CreateAippData(graph, aipp); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to create aipp data for aipp %s data %s", aipp->GetName().c_str(), + out_in_anchors.first->GetOwnerNode()->GetName().c_str()); + return INTERNAL_ERROR; + } + } + GELOGI("Create aipp %s and insert it into the graph", aipp->GetName().c_str()); + } else { + out_in_anchors.second->UnlinkAll(); + auto &aipp = iter->second; + auto ret = out_in_anchors.second->LinkFrom(aipp->GetOutDataAnchor(0)); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to link aipp %s to the peer node %s", aipp->GetName().c_str(), + out_in_anchors.second->GetOwnerNode()->GetName().c_str()); + return INTERNAL_ERROR; + } + } } - return aipp_params_->aipp_mode(); + + return SUCCESS; } +NodePtr AippOp::CreateAipp(const ComputeGraphPtr &graph, const OutDataAnchorPtr &out_anchor, + const std::string &aippConfigPath, const uint32_t &index) { + std::string current_name = + out_anchor->GetOwnerNode()->GetName() + "_" + std::to_string(out_anchor->GetIdx()) + "_huawei_aipp"; + auto aipp_opdesc_ptr = MakeShared<OpDesc>(current_name, AIPP); + if (aipp_opdesc_ptr == nullptr) { + GELOGE(OUT_OF_MEMORY, "Failed to alloc aipp desc, name %s", current_name.c_str()); + return nullptr; + } + + // Update attributes + GeAttrValue::NamedAttrs aipp_attr; + ConvertParamToAttr(aipp_attr); + // a useless attr but defined in IR, we actually use `aipp_config_route` + if (!AttrUtils::SetStr(aipp_opdesc_ptr, "aipp_config_path", "./aipp.cfg")) { + GELOGE(INTERNAL_ERROR, "Set config file path attr for aipp node failed"); + return nullptr; + } + if (!AttrUtils::SetNamedAttrs(aipp_opdesc_ptr, ATTR_NAME_AIPP, aipp_attr)) { + GELOGE(INTERNAL_ERROR, "Set name attrs for aipp node failed"); + return nullptr; + } + if (!AttrUtils::SetStr(aipp_opdesc_ptr, kAippConfigPath, aippConfigPath)) { + GELOGE(INTERNAL_ERROR, "Set config file path attr for aipp node failed"); + return nullptr; + } + if (!AttrUtils::SetListStr(aipp_opdesc_ptr, ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, std::vector<std::string>())) { + GELOGE(INTERNAL_ERROR, "Set ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES attr for aipp node failed"); + return nullptr; + } + if (!AttrUtils::SetInt(aipp_opdesc_ptr, kCurrentAippIndex, index)) { + GELOGE(INTERNAL_ERROR, "Set kCurrentAippIndex attr for aipp node failed"); + return nullptr; + } + + // add input/output desc + GeTensorDesc tensor; + auto ret = aipp_opdesc_ptr->AddInputDesc("images", tensor); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to add input images for aipp node"); + return nullptr; + } + if (GetAippMode() == domi::AippOpParams::dynamic) { + ret = aipp_opdesc_ptr->AddOptionalInputDesc("params", tensor); + if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to add input params for aipp node"); + return nullptr; + } + } + ret = aipp_opdesc_ptr->AddOutputDesc("features", tensor);
if (ret != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to add output features for aipp node"); + return nullptr; + } + + // Update input desc, the output desc will be flushed when InferShape + auto node_desc = out_anchor->GetOwnerNode()->GetOpDesc(); + if (node_desc == nullptr) { + return nullptr; + } + auto opdesc_src_data = node_desc->GetOutputDesc(out_anchor->GetIdx()); + if (opdesc_src_data.GetDataType() != DT_FLOAT) { + GELOGW("The datatype of data node %s is not FP32", node_desc->GetName().c_str()); + opdesc_src_data.SetDataType(DT_FLOAT); + } + // We must get the TensorDesc from the output anchor on the Data node, + // and update the TensorDesc to the input anchor on the Aipp node. + // Because the InferShape function for the Aipp node needs the input tensor format, + // but the InferFormat process before InferShape can not infer the format + // if the tensor on the Aipp has an unknown shape + if (aipp_opdesc_ptr->UpdateInputDesc(kAippImageInputIndex, opdesc_src_data) != GRAPH_SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to update the output desc from node %s to aipp %s", node_desc->GetName().c_str(), + aipp_opdesc_ptr->GetName().c_str()); + return nullptr; + } + + return graph->AddNode(aipp_opdesc_ptr); +} + +domi::AippOpParams::AippMode AippOp::GetAippMode() { return aipp_params_->aipp_mode(); } + +NodePtr AippOp::FindDataByIndex(const ComputeGraphPtr &graph, int rank) { + int64_t data_index = 0; + for (auto &node : graph->GetDirectNode()) { + if (node->GetType() != DATA) { + continue; + } + // There is no `index` attribute on the `Data` node when compile in inference scene + // so we can only use the order of all `Data` nodes to infer the data index + if (data_index++ != rank) { + continue; + } + return node; + } + GELOGE(PARAM_INVALID, "Can not find the data node by index %d", rank); + return nullptr; +} +Status AippOp::GetAndCheckTarget(const ComputeGraphPtr &graph, int rank, NodePtr &target, + std::set &edge_indexes) { + auto data_node = FindDataByIndex(graph, rank); + if (data_node == nullptr) { + GELOGE(PARAM_INVALID, "Get target input node for rank %d failed", rank); + return PARAM_INVALID; + } + + if (aipp_params_->input_edge_idx_size() > 0) { + for (auto edge_index : aipp_params_->input_edge_idx()) { + edge_indexes.insert(edge_index); + } + } + + if (!edge_indexes.empty() && (*edge_indexes.rbegin() >= data_node->GetOutDataNodes().size())) { + GELOGE(PARAM_INVALID, "input_edge_idx %u should smaller than out edge size of target input %zu", + *edge_indexes.rbegin(), data_node->GetOutDataNodes().size()); + return PARAM_INVALID; + } + target = data_node; + + std::string related_node_name; + if ((GetAippMode() == domi::AippOpParams::static_) && + AttrUtils::GetStr(data_node->GetOpDesc(), kMbatchSwitchnName, related_node_name)) { + if (related_node_name.empty()) { + GELOGE(INTERNAL_ERROR, "The data node %s has switchn node flag, but the value is empty", + data_node->GetName().c_str()); + return INTERNAL_ERROR; + } + auto switchn = graph->FindNode(related_node_name); + if (switchn == nullptr) { + GELOGE(INTERNAL_ERROR, "The data node %s has switchn node %s, but can not find it on the graph", + data_node->GetName().c_str(), switchn->GetName().c_str()); + return INTERNAL_ERROR; + } + target = switchn; + GELOGI( + "Multi-batch/image size and static aipp for data %s, " + "the aipp node will be insert after %s instead of origin data node", + data_node->GetName().c_str(), switchn->GetName().c_str()); + } + + return SUCCESS; +} Status AippOp::GetTargetPosition(ComputeGraphPtr 
graph, NodePtr &target_input, std::vector> &target_edges) { GE_CHECK_NOTNULL(graph); - target_input = nullptr; - target_edges.clear(); - GE_CHECK_NOTNULL(aipp_params_); - const uint32_t related_input_rank = aipp_params_->related_input_rank(); - GE_CHK_STATUS_RET(GetInputNode(graph, target_input, related_input_rank), "get target input node failed"); - - const bool is_edge_configed = aipp_params_->input_edge_idx_size() > 0; - GE_CHK_BOOL_RET_STATUS(!is_edge_configed || aipp_params_->input_edge_idx(0) < target_input->GetOutDataNodes().size(), - PARAM_INVALID, "input_edge_idx %u should smaller than out edge size of target input %zu ", - aipp_params_->input_edge_idx(0), target_input->GetOutDataNodes().size()); + std::set edge_indexes; + const uint32_t related_input_rank = aipp_params_->related_input_rank(); + auto ret = GetAndCheckTarget(graph, related_input_rank, target_input, edge_indexes); + if (ret != SUCCESS) { + GELOGE(ret, "Get target input node for rank %u failed", related_input_rank); + return ret; + } - uint32_t i = 0; + target_edges.clear(); for (OutDataAnchorPtr &src_out : target_input->GetAllOutDataAnchors()) { - GE_RETURN_WITH_LOG_IF_FALSE(src_out != nullptr, "OutDataAnchor is null."); - auto vistor = src_out->GetPeerInDataAnchors(); - for (auto it = vistor.begin(); it != vistor.end(); ++it, ++i) { - InDataAnchorPtr dst_in = *it; - GE_RETURN_WITH_LOG_IF_FALSE(dst_in != nullptr, "InDataAnchor is null."); - - if ((is_edge_configed && i == aipp_params_->input_edge_idx(0)) || !is_edge_configed) { - NodePtr dst_node = dst_in->GetOwnerNode(); - OpDescPtr dst_op = dst_node->GetOpDesc(); - if (kInsertAippExceptOp.find(dst_op->GetType()) == kInsertAippExceptOp.end()) { - target_edges.push_back(make_pair(src_out, dst_in)); - continue; - } - - GE_CHK_BOOL_RET_STATUS(!is_edge_configed, PARAM_INVALID, "index %d of input node is %s node, can not do aipp", - aipp_params_->input_edge_idx(0), dst_op->GetType().c_str()); + auto dst_ins = src_out->GetPeerInDataAnchors(); + for (uint32_t i = 0; i < dst_ins.size(); ++i) { + auto dst_in = dst_ins.at(i); + if (edge_indexes.empty() || edge_indexes.count(i) > 0) { + target_edges.emplace_back(src_out, dst_in); } } } - GE_CHK_BOOL_RET_STATUS(target_edges.size() > 0, FAILED, "get target edges failed"); - return SUCCESS; } @@ -256,39 +494,39 @@ Status AippOp::ValidateParams() { void AippOp::SetCscDefaultValue() { GE_CHECK_NOTNULL_JUST_RETURN(aipp_params_); if (aipp_params_->input_format() == domi::AippOpParams::YUV420SP_U8) { - CHECK_FALSE_EXEC(aipp_params_->matrix_r0c0_size() > 0, aipp_params_->add_matrix_r0c0(kDefaultMatrixR2C0Yuv2Rgb)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r0c1_size() > 0, aipp_params_->add_matrix_r0c1(kDefaultMatrixR2C1Yuv2Rgb)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r0c2_size() > 0, aipp_params_->add_matrix_r0c2(kDefaultMatrixR2C2Yuv2Rgb)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r1c0_size() > 0, aipp_params_->add_matrix_r1c0(kDefaultMatrixR1C0Yuv2Rgb)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r1c1_size() > 0, aipp_params_->add_matrix_r1c1(kDefaultMatrixR1C1Yuv2Rgb)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r1c2_size() > 0, aipp_params_->add_matrix_r1c2(kDefaultMatrixR1C2Yuv2Rgb)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r2c0_size() > 0, aipp_params_->add_matrix_r2c0(kDefaultMatrixR0C0Yuv2Rgb)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r2c1_size() > 0, aipp_params_->add_matrix_r2c1(kDefaultMatrixR0C1Yuv2Rgb)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r2c2_size() > 0, aipp_params_->add_matrix_r2c2(kDefaultMatrixR0C2Yuv2Rgb)); + 
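// [Editor's note, illustrative only -- not part of the patch] The reworked
// InsertAippToGraph above creates at most one AIPP node per Data output anchor
// and re-routes every additional consumer edge to that node's first output.
// A minimal sketch of that reuse pattern, using only GE calls that appear in
// this patch; `MakeAippFor` is a hypothetical stand-in for AippOp::CreateAipp:
//
//   std::map<OutDataAnchorPtr, NodePtr> created;
//   for (auto &edge : target_edges) {
//     auto iter = created.find(edge.first);
//     if (iter == created.end()) {
//       NodePtr aipp = MakeAippFor(edge.first);  // hypothetical factory
//       created[edge.first] = aipp;
//       // splice the new AIPP into the existing Data -> consumer edge
//       (void)GraphUtils::InsertNodeBetweenDataAnchors(edge.first, edge.second, aipp);
//     } else {
//       // later edges from the same anchor share the first AIPP
//       edge.second->UnlinkAll();
//       (void)edge.second->LinkFrom(iter->second->GetOutDataAnchor(0));
//     }
//   }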
CHECK_FALSE_EXEC(aipp_params_->matrix_r0c0_size() > 0, aipp_params_->add_matrix_r0c0(DEFAULT_MATRIX_R2C0_YUV2RGB)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r0c1_size() > 0, aipp_params_->add_matrix_r0c1(DEFAULT_MATRIX_R2C1_YUV2RGB)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r0c2_size() > 0, aipp_params_->add_matrix_r0c2(DEFAULT_MATRIX_R2C2_YUV2RGB)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r1c0_size() > 0, aipp_params_->add_matrix_r1c0(DEFAULT_MATRIX_R1C0_YUV2RGB)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r1c1_size() > 0, aipp_params_->add_matrix_r1c1(DEFAULT_MATRIX_R1C1_YUV2RGB)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r1c2_size() > 0, aipp_params_->add_matrix_r1c2(DEFAULT_MATRIX_R1C2_YUV2RGB)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r2c0_size() > 0, aipp_params_->add_matrix_r2c0(DEFAULT_MATRIX_R0C0_YUV2RGB)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r2c1_size() > 0, aipp_params_->add_matrix_r2c1(DEFAULT_MATRIX_R0C1_YUV2RGB)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r2c2_size() > 0, aipp_params_->add_matrix_r2c2(DEFAULT_MATRIX_R0C2_YUV2RGB)); } else { - CHECK_FALSE_EXEC(aipp_params_->matrix_r0c0_size() > 0, aipp_params_->add_matrix_r0c0(kDefaultMatrixR0C0Rgb2Yuv)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r0c1_size() > 0, aipp_params_->add_matrix_r0c1(kDefaultMatrixR0C1Rgb2Yuv)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r0c2_size() > 0, aipp_params_->add_matrix_r0c2(kDefaultMatrixR0C2Rgb2Yuv)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r1c0_size() > 0, aipp_params_->add_matrix_r1c0(kDefaultMatrixR1C0Rgb2Yuv)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r1c1_size() > 0, aipp_params_->add_matrix_r1c1(kDefaultMatrixR1C1Rgb2Yuv)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r1c2_size() > 0, aipp_params_->add_matrix_r1c2(kDefaultMatrixR1C2Rgb2Yuv)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r2c0_size() > 0, aipp_params_->add_matrix_r2c0(kDefaultMatrixR2C0Rgb2Yuv)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r2c1_size() > 0, aipp_params_->add_matrix_r2c1(kDefaultMatrixR2C1Rgb2Yuv)); - CHECK_FALSE_EXEC(aipp_params_->matrix_r2c2_size() > 0, aipp_params_->add_matrix_r2c2(kDefaultMatrixR2C2Rgb2Yuv)); - } - CHECK_FALSE_EXEC(aipp_params_->input_bias_0_size() > 0, aipp_params_->add_input_bias_0(kDefaultInputBias0)); - CHECK_FALSE_EXEC(aipp_params_->input_bias_1_size() > 0, aipp_params_->add_input_bias_1(kDefaultInputBias1)); - CHECK_FALSE_EXEC(aipp_params_->input_bias_2_size() > 0, aipp_params_->add_input_bias_2(kDefaultInputBias2)); - CHECK_FALSE_EXEC(aipp_params_->output_bias_0_size() > 0, aipp_params_->add_output_bias_0(kDefaultOutputBias0)); - CHECK_FALSE_EXEC(aipp_params_->output_bias_1_size() > 0, aipp_params_->add_output_bias_1(kDefaultOutputBias1)); - CHECK_FALSE_EXEC(aipp_params_->output_bias_2_size() > 0, aipp_params_->add_output_bias_2(kDefaultOutputBias2)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r0c0_size() > 0, aipp_params_->add_matrix_r0c0(DEFAULT_MATRIX_R0C0_RGB2YUV)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r0c1_size() > 0, aipp_params_->add_matrix_r0c1(DEFAULT_MATRIX_R0C1_RGB2YUV)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r0c2_size() > 0, aipp_params_->add_matrix_r0c2(DEFAULT_MATRIX_R0C2_RGB2YUV)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r1c0_size() > 0, aipp_params_->add_matrix_r1c0(DEFAULT_MATRIX_R1C0_RGB2YUV)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r1c1_size() > 0, aipp_params_->add_matrix_r1c1(DEFAULT_MATRIX_R1C1_RGB2YUV)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r1c2_size() > 0, aipp_params_->add_matrix_r1c2(DEFAULT_MATRIX_R1C2_RGB2YUV)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r2c0_size() > 
0, aipp_params_->add_matrix_r2c0(DEFAULT_MATRIX_R2C0_RGB2YUV)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r2c1_size() > 0, aipp_params_->add_matrix_r2c1(DEFAULT_MATRIX_R2C1_RGB2YUV)); + CHECK_FALSE_EXEC(aipp_params_->matrix_r2c2_size() > 0, aipp_params_->add_matrix_r2c2(DEFAULT_MATRIX_R2C2_RGB2YUV)); + } + CHECK_FALSE_EXEC(aipp_params_->input_bias_0_size() > 0, aipp_params_->add_input_bias_0(DEFAULT_INPUT_BIAS_0)); + CHECK_FALSE_EXEC(aipp_params_->input_bias_1_size() > 0, aipp_params_->add_input_bias_1(DEFAULT_INPUT_BIAS_1)); + CHECK_FALSE_EXEC(aipp_params_->input_bias_2_size() > 0, aipp_params_->add_input_bias_2(DEFAULT_INPUT_BIAS_2)); + CHECK_FALSE_EXEC(aipp_params_->output_bias_0_size() > 0, aipp_params_->add_output_bias_0(DEFAULT_OUTPUT_BIAS_0)); + CHECK_FALSE_EXEC(aipp_params_->output_bias_1_size() > 0, aipp_params_->add_output_bias_1(DEFAULT_OUTPUT_BIAS_1)); + CHECK_FALSE_EXEC(aipp_params_->output_bias_2_size() > 0, aipp_params_->add_output_bias_2(DEFAULT_OUTPUT_BIAS_2)); } void AippOp::SetDtcDefaultValue() { GE_CHECK_NOTNULL_JUST_RETURN(aipp_params_); - CHECK_FALSE_EXEC(aipp_params_->var_reci_chn_0_size() > 0, aipp_params_->add_var_reci_chn_0(kDefaultVarReciChn)); - CHECK_FALSE_EXEC(aipp_params_->var_reci_chn_1_size() > 0, aipp_params_->add_var_reci_chn_1(kDefaultVarReciChn)); - CHECK_FALSE_EXEC(aipp_params_->var_reci_chn_2_size() > 0, aipp_params_->add_var_reci_chn_2(kDefaultVarReciChn)); + CHECK_FALSE_EXEC(aipp_params_->var_reci_chn_0_size() > 0, aipp_params_->add_var_reci_chn_0(DEFAULT_VAR_RECI_CHN)); + CHECK_FALSE_EXEC(aipp_params_->var_reci_chn_1_size() > 0, aipp_params_->add_var_reci_chn_1(DEFAULT_VAR_RECI_CHN)); + CHECK_FALSE_EXEC(aipp_params_->var_reci_chn_2_size() > 0, aipp_params_->add_var_reci_chn_2(DEFAULT_VAR_RECI_CHN)); } Status AippOp::GenerateOpDesc(OpDescPtr op_desc) { @@ -375,4 +613,71 @@ void AippOp::ConvertParamToAttr(GeAttrValue::NamedAttrs &aipp_attrs) { SAVE_AIPP_ATTR(support_rotation, GeAttrValue::BOOL); } } +Status AippOp::CreateAippData(const ComputeGraphPtr &graph, const NodePtr &aipp_node) { + GELOGD("Enter add aipp data node process."); + // get previous node, it should be DATA + auto data_node = aipp_node->GetInDataNodes().at(kAippImageInputIndex); + GE_CHECK_NOTNULL(data_node->GetOpDesc()); + + auto ori_data_format = GetAndCheckFormat(); + if (ori_data_format != FORMAT_NCHW && ori_data_format != FORMAT_NHWC) { + GELOGE(PARAM_INVALID, "when dynamic aipp, input_format must be NCHW or NHWC, but [%s] format is %s", + data_node->GetName().c_str(), TypeUtils::FormatToSerialString(ori_data_format).c_str()); + return PARAM_INVALID; + } + + int64_t batch_count = -1; + auto ret = GetDataDimN(data_node, ori_data_format, batch_count); + if (ret != ge::SUCCESS) { + GELOGE(PARAM_INVALID, "Get data_node dims and transfer to nchw_dims failed!"); + return PARAM_INVALID; + } + if (batch_count <= 0) { + GELOGE(PARAM_INVALID, "Batch count %ld is invalid", batch_count); + return PARAM_INVALID; + } + + int64_t max_dynamic_aipp_size = CalcMaxSize(batch_count); + if (max_dynamic_aipp_size < 0) { + GELOGE(PARAM_INVALID, "The dynamic aipp size is not positive."); + return PARAM_INVALID; + } + + GELOGI("Add aipp input data, batch count is %ld, max_dynamic_aipp_size is %ld", batch_count, max_dynamic_aipp_size); + std::vector input_shape_dim(1, max_dynamic_aipp_size); + GeShape input_shape(input_shape_dim); + // construct input tensor + GeTensorDesc input_tensor(input_shape, FORMAT_ND, DT_UINT8); + TensorUtils::SetReuseInput(input_tensor, false); + TensorUtils::SetSize(input_tensor, 
max_dynamic_aipp_size);
+
+  // new add aipp_data ops for dynamic aipp param input
+  OpDescPtr op_desc_ptr_data = MakeShared<OpDesc>(kDynamicAippData, AIPPDATA);
+  GE_CHECK_NOTNULL(op_desc_ptr_data);
+  auto stat1 = op_desc_ptr_data->AddInputDesc(input_tensor);
+
+  GeShape output_shape(input_shape_dim);
+  // construct output tensor
+  GeTensorDesc output_tensor(output_shape, FORMAT_ND, DT_UINT8);
+  TensorUtils::SetReuseInput(output_tensor, false);
+  TensorUtils::SetSize(output_tensor, max_dynamic_aipp_size);
+  auto stat2 = op_desc_ptr_data->AddOutputDesc(output_tensor);
+
+  NodePtr aipp_data_node_ptr = graph->AddNode(op_desc_ptr_data);
+  GE_CHECK_NOTNULL(aipp_data_node_ptr);
+
+  // add node desc for aipp node
+  auto stat3 = aipp_node->GetOpDesc()->UpdateInputDesc(kAippParamsInputIndex, output_tensor);
+  if (stat1 != GRAPH_SUCCESS || stat2 != GRAPH_SUCCESS || stat3 != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "node process desc failed!");
+    return INTERNAL_ERROR;
+  }
+  // aipp_node should have two data inputs, but for now only one is linked
+  if (GraphUtils::AddEdge(aipp_data_node_ptr->GetOutDataAnchor(kAippDataOutputIndex),
+                          aipp_node->GetInDataAnchor(kAippParamsInputIndex)) != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Add anchor between aipp data node and aipp failed!");
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
 }  // namespace ge
diff --git a/src/ge/graph/preprocess/insert_op/ge_aipp_op.h b/src/ge/graph/preprocess/insert_op/ge_aipp_op.h
index 0b288971..61baacfd 100644
--- a/src/ge/graph/preprocess/insert_op/ge_aipp_op.h
+++ b/src/ge/graph/preprocess/insert_op/ge_aipp_op.h
@@ -60,20 +60,22 @@ class AippOp : public InsertOpBase {
   Status GetTargetPosition(ge::ComputeGraphPtr graph, ge::NodePtr &target_input,
                            std::vector<std::pair<OutDataAnchorPtr, InDataAnchorPtr>> &target_edges) override;
+  Status InsertAippToGraph(ge::ComputeGraphPtr &graph, std::string &aippConfigPath, const uint32_t index) override;
+
   domi::AippOpParams::AippMode GetAippMode() override;

 private:
  AippOp &operator=(const AippOp &aipp_op);
  AippOp(const AippOp &aipp_op);

-  ///
-  /// @ingroup domi_omg
-  /// @brief Convert Param To Attr
-  ///
   void ConvertParamToAttr(ge::GeAttrValue::NamedAttrs &aipp_attrs);
   void SetCscDefaultValue();
-
   void SetDtcDefaultValue();
+  NodePtr FindDataByIndex(const ComputeGraphPtr &graph, int rank);
+  Status GetAndCheckTarget(const ComputeGraphPtr &graph, int rank, NodePtr &target, std::set<uint32_t> &edge_indexes);
+  NodePtr CreateAipp(const ComputeGraphPtr &graph, const OutDataAnchorPtr &out_anchor,
+                     const std::string &aippConfigPath, const uint32_t &index);
+  Status CreateAippData(const ComputeGraphPtr &graph, const NodePtr &aipp);

   domi::AippOpParams *aipp_params_ = nullptr;
   ge::NodePtr aipp_node_ = nullptr;
diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc
index 668ee1e4..fbdcc217 100644
--- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc
+++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc
@@ -21,88 +21,31 @@
 #include "common/op/ge_op_utils.h"
 #include "common/util.h"
 #include "framework/common/debug/ge_log.h"
+#include "framework/common/debug/log.h"
 #include "framework/common/ge_inner_error_codes.h"
 #include "framework/omg/omg_inner_types.h"
-#include "graph/preprocess/insert_op/ge_aipp_op.h"
 #include "graph/debug/ge_attr_define.h"
+#include "graph/preprocess/insert_op/ge_aipp_op.h"
 #include "graph/utils/attr_utils.h"
 #include "graph/utils/graph_utils.h"
 #include "graph/utils/op_desc_utils.h"
 #include "graph/utils/tensor_utils.h"
 #include
"graph/utils/type_utils.h" #include "inc/common/dynamic_aipp.h" +#include "common/formats/utils/formats_trans_utils.h" using domi::AippOpParams; namespace ge { +namespace { +const char *const kMbatchSwitchnName = "mbatch-switch-name"; +} // namespace Status InsertNewOpUtil::Init() { insert_op_conf_.reset((new (std::nothrow) domi::InsertNewOps())); GE_CHECK_NOTNULL(insert_op_conf_); return SUCCESS; } -namespace { -constexpr uint64_t kMinTransferShape = 3; -constexpr int64_t kMaxBatchCountNum = 32768; - -Status ExpandDimsAndNormalizedToNCHW(ge::Format src_format, const std::vector &src_dims, - std::vector &nchw_dims) { - GELOGD("Enter ExpandDimsAndNormalizedToNCHW process!"); - // The input of 3-dimension and 4-dimension is considered as picture dimension, - // which needs to be converted according to specific format - if (src_dims.size() != DIM_DEFAULT_SIZE && src_dims.size() != kMinTransferShape) { - GELOGE(PARAM_INVALID, "expand and normalize format failed, src size [%lu] is not in range [3,4]", src_dims.size()); - return PARAM_INVALID; - } - - switch (src_format) { - case ge::FORMAT_NCHW: - if (src_dims.size() == DIM_DEFAULT_SIZE) { - nchw_dims = src_dims; - } else { - nchw_dims.push_back(1); - nchw_dims.push_back(src_dims[0]); - nchw_dims.push_back(src_dims[1]); - nchw_dims.push_back(src_dims[2]); - } - break; - case ge::FORMAT_NHWC: - if (src_dims.size() == DIM_DEFAULT_SIZE) { - nchw_dims.push_back(src_dims[NHWC_DIM_N]); - nchw_dims.push_back(src_dims[NHWC_DIM_C]); - nchw_dims.push_back(src_dims[NHWC_DIM_H]); - nchw_dims.push_back(src_dims[NHWC_DIM_W]); - } else { - nchw_dims.push_back(1); - nchw_dims.push_back(src_dims[HWC_DIM_C]); - nchw_dims.push_back(src_dims[HWC_DIM_H]); - nchw_dims.push_back(src_dims[HWC_DIM_W]); - } - break; - default: - GELOGE(PARAM_INVALID, "Not support src format: %d", src_format); - return PARAM_INVALID; - } - - return ge::SUCCESS; -} -Status GetDataOpDims(const ge::NodePtr data_node, ge::Format format, std::vector &nchw_dims) { - GELOGD("Enter GetDataOpDims process!"); - - auto data_input_desc_ptr = data_node->GetOpDesc()->GetInputDescPtr(0); // GetOpDesc() has check null before logic - if (data_input_desc_ptr == nullptr) { - GELOGE(PARAM_INVALID, "data_node's input desc object is null"); - return PARAM_INVALID; - } - auto shape = data_input_desc_ptr->GetShape().GetDims(); - if ((shape.size() < kMinTransferShape) && (shape.size() > DIM_DEFAULT_SIZE)) { - GELOGE(PARAM_INVALID, "when dynamic aipp, shape must be in range [3, 4], but is %lu", shape.size()); - return PARAM_INVALID; - } - - return ExpandDimsAndNormalizedToNCHW(format, shape, nchw_dims); -} -} // namespace Status InsertNewOpUtil::Parse(const char *conf_path) { if (conf_path == nullptr || *conf_path == '\0') { return SUCCESS; @@ -130,117 +73,10 @@ Status InsertNewOpUtil::Parse(const char *conf_path) { return SUCCESS; } -Status InsertNewOpUtil::AddAippInputData(ge::NodePtr aipp_node, ge::ComputeGraphPtr graph) { - GELOGD("Enter add aipp data node process!"); - static int index = 0; - - // get previous node, it should be DATA - auto data_node = aipp_node->GetInDataNodes().at(0); - if (data_node->GetOpDesc() == nullptr) { - GELOGE(PARAM_INVALID, "data node has no opdesc!"); - return PARAM_INVALID; - } - if (data_node->GetOpDesc()->GetType() != DATA) { - GELOGE(PARAM_INVALID, "aipp node should follow one data node, but previous node's type is %s", - data_node->GetOpDesc()->GetType().c_str()); - return PARAM_INVALID; - } - auto ori_data_format = static_cast(static_cast(domi::GetContext().format)); - if 
(ori_data_format != FORMAT_NCHW && ori_data_format != FORMAT_NHWC) { - GELOGE(PARAM_INVALID, "when dynamic aipp,input_format must be NCHW or NHWC, but [%s] format is %s", - data_node->GetName().c_str(), ge::TypeUtils::FormatToSerialString(ori_data_format).c_str()); - return PARAM_INVALID; - } - - std::vector nchw_dims; - auto ret = GetDataOpDims(data_node, ori_data_format, nchw_dims); - if (ret != ge::SUCCESS) { - GELOGE(PARAM_INVALID, "get data_node dims and transfer to nchw_dims failed!"); - return PARAM_INVALID; - } - - auto batch_count = nchw_dims[NCHW_DIM_N]; - // new add aipp_data ops for dynamic aipp param input - OpDescPtr op_desc_ptr_data = - ge::MakeShared(std::string("aipp_data_").append(std::to_string(index++)), AIPPDATA); - - // calc max size - if (batch_count <= 0 || batch_count > kMaxBatchCountNum) { - GELOGE(PARAM_INVALID, "batch_cout must be in range(0, %ld]", kMaxBatchCountNum); - return PARAM_INVALID; - } - uint64_t max_dynamic_aipp_size = sizeof(kAippDynamicPara) + (batch_count - 1) * sizeof(kAippDynamicBatchPara); - - GELOGI("Add aipp input data, batch count: %ld, max_dynamic_aipp_size: %ld", batch_count, max_dynamic_aipp_size); - vector input_shape_dim(1, 1); - input_shape_dim[0] = static_cast(max_dynamic_aipp_size); - GeShape input_shape(input_shape_dim); - // construct input tensor - GeTensorDesc input_tensor(input_shape, FORMAT_ND, DT_UINT8); - TensorUtils::SetReuseInput(input_tensor, false); - TensorUtils::SetSize(input_tensor, static_cast(max_dynamic_aipp_size)); - - auto stat1 = op_desc_ptr_data->AddInputDesc(input_tensor); - - GeShape output_shape(input_shape_dim); - // construct output tensor - GeTensorDesc output_tensor(output_shape, FORMAT_ND, DT_UINT8); - TensorUtils::SetReuseInput(output_tensor, false); - TensorUtils::SetSize(output_tensor, static_cast(max_dynamic_aipp_size)); - auto stat2 = op_desc_ptr_data->AddOutputDesc(output_tensor); - - NodePtr aipp_data_node_ptr = graph->AddNode(op_desc_ptr_data); - if (aipp_data_node_ptr == nullptr) { - GELOGE(INTERNAL_ERROR, "graph add node failed."); - return INTERNAL_ERROR; - } - // add node desc for aipp node - auto stat3 = aipp_node->GetOpDesc()->UpdateInputDesc(1, output_tensor); - if (stat1 != SUCCESS || stat2 != SUCCESS || stat3 != SUCCESS) { - GELOGE(INTERNAL_ERROR, "node process desc failed!"); - return INTERNAL_ERROR; - } - // aipp_node should have two input data but now tbe only one input - if (GraphUtils::AddEdge(aipp_data_node_ptr->GetOutDataAnchor(0), aipp_node->GetInDataAnchor(1)) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Add Anchor anchor between aipp data node and aipp failed!"); - return INTERNAL_ERROR; - } - - return SUCCESS; -} - -Status InsertNewOpUtil::InsertNewOps(const ComputeGraphPtr &graph) { - GE_CHECK_NOTNULL(graph); - for (auto &insert_op : insert_ops_) { - GE_CHK_STATUS_RET(insert_op->InsertOpToGraph(graph), "insert op to graph failed"); - } - - GE_CHK_STATUS_RET(CheckGraph(graph), "after inserting all ops, check graph failed"); - - GE_CHK_STATUS_RET(graph->TopologicalSorting(), "after insert dynamic op, sort graph failed"); - - ClearNewOps(); - - return SUCCESS; -} - Status InsertNewOpUtil::InsertAippOps(ComputeGraphPtr &graph, std::string &aippConfigPath) { GE_CHECK_NOTNULL(graph); - for (auto &insert_op : insert_ops_) { - AippOpParams::AippMode aipp_mode = insert_op->GetAippMode(); - ge::NodePtr aipp_node = nullptr; - GE_CHK_STATUS_RET(insert_op->InsertAippToGraph(graph, aippConfigPath, aipp_node), "insert op to graph failed"); - if (aipp_node == nullptr) { - GELOGE(FAILED, "aipp 
node is null!"); - return FAILED; - } - if (aipp_mode == AippOpParams::dynamic) { - Status stat = AddAippInputData(aipp_node, graph); - if (stat != SUCCESS) { - GELOGE(FAILED, "Add aipp input data failed"); - return FAILED; - } - } + for (uint32_t index = 0; index < insert_ops_.size(); ++index) { + GE_CHK_STATUS_RET(insert_ops_[index]->InsertAippToGraph(graph, aippConfigPath, index), "insert op to graph failed"); } GE_CHK_STATUS_RET(CheckGraph(graph), "after inserting all ops, check graph failed"); @@ -284,7 +120,7 @@ Status InsertNewOpUtil::CheckGraph(const ComputeGraphPtr &graph) { GE_CHECK_NOTNULL(graph); domi::AippOpParams::AippMode aippMode = domi::AippOpParams::undefined; - for (const auto &node : graph->GetAllNodes()) { + for (const auto &node : graph->GetDirectNode()) { if (node->GetType() != DATA) { continue; } @@ -294,8 +130,6 @@ Status InsertNewOpUtil::CheckGraph(const ComputeGraphPtr &graph) { for (const auto &inAnchor : anchor->GetPeerInDataAnchors()) { const std::string &nodeType = inAnchor->GetOwnerNode()->GetType(); - GE_IF_BOOL_EXEC(nodeType == SSDPRIORBOX || nodeType == SHAPE, continue;); - GE_CHK_BOOL_RET_STATUS(aippNodes.size() == 0 || nodeType == AIPP, PARAM_INVALID, "Can not config part of outputs of Data node to support AIPP, config all of the " "outputs of Data to support AIPP, or config none of them"); @@ -314,7 +148,6 @@ Status InsertNewOpUtil::CheckGraph(const ComputeGraphPtr &graph) { aippMode = (aippMode == domi::AippOpParams::undefined) ? aippParams->aipp_mode() : aippMode; GE_CHK_BOOL_RET_STATUS(aippMode == aippParams->aipp_mode(), PARAM_INVALID, "The aipp_mode of all aipp_op must be the same");); - GE_IF_BOOL_EXEC( aippNodes.size() > 1, for (decltype(aippNodes)::size_type i = 1; i < aippNodes.size(); i++) { std::unique_ptr currAippParam(new (std::nothrow) domi::AippOpParams()); @@ -351,55 +184,133 @@ Status InsertNewOpUtil::GetAippParams(const std::unique_ptr return SUCCESS; } +Status InsertNewOpUtil::UpdateDataNodeByAipp(const ComputeGraphPtr &graph) { + std::map switchn_names_to_data; + std::set updated_switchn; -Status InsertNewOpUtil::AddMultiShapeInputData(const ge::ComputeGraphPtr &graph) { - GE_CHECK_NOTNULL(graph); for (auto &node : graph->GetDirectNode()) { - GE_CHECK_NOTNULL(node); - if (node->GetOpDesc()->GetType() != MULTISHAPE) { - continue; + if (node->GetType() == DATA) { + std::string switchn_name; + if (AttrUtils::GetStr(node->GetOpDesc(), kMbatchSwitchnName, switchn_name)) { + switchn_names_to_data[switchn_name] = node; + } } - - GE_CHK_BOOL_RET_STATUS(node->GetInDataNodes().size() == 1, FAILED, - "multi_shape node should follow one data node, but size of input edges is %d", - node->GetInDataNodes().size()); - - NodePtr dataNode = node->GetInDataNodes().at(0); - GE_CHK_BOOL_RET_STATUS(dataNode->GetOpDesc()->GetType() == DATA, FAILED, - "multi_shape node should follow one data node, but previous node's type is %s", - dataNode->GetOpDesc()->GetType().c_str()); - - OpDescPtr opDescPtrData = MakeShared(std::string("multi_shape_data"), DATA); - if (opDescPtrData == nullptr) { - return PARAM_INVALID; + if (node->GetType() == AIPP) { + GE_RETURN_IF_ERROR(UpdatePrevNodeByAipp(node, updated_switchn)); } + } - const uint32_t shapeSize = 4; - const uint32_t REALDIM_CNT = 4; - - vector inputShapeDim(4, 1); // 4 dimensions: NCHW - inputShapeDim[0] = shapeSize; - - GeShape inputShape(inputShapeDim); - GeTensorDesc input_tensor(inputShape, FORMAT_NCHW, DT_UINT32); - TensorUtils::SetReuseInput(input_tensor, false); - TensorUtils::SetSize(input_tensor, 
shapeSize * sizeof(uint32_t));
-    GE_CHK_STATUS_RET(opDescPtrData->AddInputDesc(input_tensor));
-
-    GeShape outputShape(inputShapeDim);
-    GeTensorDesc output_tensor(outputShape, FORMAT_NCHW, DT_UINT32);
-    TensorUtils::SetReuseInput(output_tensor, false);
-    TensorUtils::SetSize(output_tensor, shapeSize * sizeof(uint32_t));
-    TensorUtils::SetRealDimCnt(output_tensor, REALDIM_CNT);
+  for (auto &switchn : updated_switchn) {
+    auto data_iter = switchn_names_to_data.find(switchn->GetName());
+    if (data_iter == switchn_names_to_data.end()) {
+      GELOGE(INTERNAL_ERROR, "Failed to find relative data node by switchn %s", switchn->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+    GE_RETURN_IF_ERROR(UpdateDataBySwitchN(switchn, data_iter->second));
+  }
-    GE_CHK_STATUS_RET(opDescPtrData->AddOutputDesc(output_tensor), "AddOutputDesc failed!");
+  return SUCCESS;
+}
+Status InsertNewOpUtil::UpdatePrevNodeByAipp(NodePtr &node, std::set<NodePtr> &switchns) {
+  GELOGI("Start to update prev node size by aipp %s.", node->GetName().c_str());
+  auto aipp_op_desc = node->GetOpDesc();
+  GE_CHECK_NOTNULL(aipp_op_desc);
+  auto aipp_input = aipp_op_desc->MutableInputDesc(0);
+  GE_CHECK_NOTNULL(aipp_input);
+
+  int64_t size = 0;
+  graphStatus graph_ret = ge::TensorUtils::GetSize(*aipp_input, size);
+  if (graph_ret != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "UpdateOutputDesc fail, graph_ret:%d", graph_ret);
+    return FAILED;
+  }
+  GELOGI("Get size [%ld] from aipp [%s].", size, aipp_op_desc->GetName().c_str());
+  if (size == 0) {
+    GELOGE(FAILED, "Can not get size from aipp [%s]", aipp_op_desc->GetName().c_str());
+    return FAILED;
+  }

-    NodePtr shapeDataNodePtr = graph->AddNode(opDescPtrData);
-    GE_CHECK_NOTNULL(shapeDataNodePtr);
-    GE_CHK_STATUS_RET(GraphUtils::AddEdge(shapeDataNodePtr->GetOutDataAnchor(0), node->GetInDataAnchor(1)),
-                      "Add Anchor anchor between shape data and multi_shape failed!");
+  auto in_data_anchor = node->GetInDataAnchor(0);
+  GE_CHECK_NOTNULL(in_data_anchor);
+  auto peer_out_anchor = in_data_anchor->GetPeerOutAnchor();
+  GE_CHECK_NOTNULL(peer_out_anchor);
+  const auto &src_node = peer_out_anchor->GetOwnerNode();
+  const auto &src_op = src_node->GetOpDesc();
+  GE_CHECK_NOTNULL(src_op);
+
+  // if the type of src_node is SwitchN, its input may be updated to a size that is not the max one;
+  // the correct size will be updated in function `UpdateDataBySwitchN`
+  DataType aipp_dt = aipp_input->GetDataType();
+  aipp_input->SetOriginDataType(aipp_dt);
+  DataType aipp_origin_dt = aipp_input->GetOriginDataType();
+  GeShape aipp_shape = aipp_input->GetShape();
+  GELOGI("Aipp [%s] input datatype is %s, origin datatype is %s, input shape is %s", aipp_op_desc->GetName().c_str(),
+         TypeUtils::DataTypeToSerialString(aipp_dt).c_str(), TypeUtils::DataTypeToSerialString(aipp_origin_dt).c_str(),
+         ge::formats::ShapeToString(aipp_shape.GetDims()).c_str());
+
+  const GeTensorDescPtr &input = src_op->MutableInputDesc(0);
+  GE_CHECK_NOTNULL(input);
+  input->SetDataType(aipp_dt);
+  input->SetOriginDataType(aipp_origin_dt);
+  input->SetShape(aipp_shape);
+  input->SetOriginShape(aipp_shape);
+  ge::TensorUtils::SetSize(*input, size);
+
+  const GeTensorDescPtr &output = src_op->MutableOutputDesc(peer_out_anchor->GetIdx());
+  GE_CHECK_NOTNULL(output);
+  output->SetDataType(aipp_dt);
+  output->SetOriginDataType(aipp_origin_dt);
+  output->SetShape(aipp_shape);
+  output->SetOriginShape(aipp_shape);
+  ge::TensorUtils::SetSize(*output, size);
+  if (src_node->GetType() == SWITCHN) {
+    switchns.insert(src_node);
+  }
+  GELOGI("Set node %s output %d size %ld by aipp.", src_node->GetName().c_str(), peer_out_anchor->GetIdx(), size);

   return SUCCESS;
 }
+Status InsertNewOpUtil::UpdateDataBySwitchN(const NodePtr &switchn, const NodePtr &data) {
+  size_t max_index = switchn->GetOpDesc()->GetOutputsSize();
+  int64_t max_size = 0;
+  for (size_t i = 0; i < switchn->GetOpDesc()->GetOutputsSize(); ++i) {
+    int64_t size = 0;
+    auto output_desc = switchn->GetOpDesc()->MutableOutputDesc(i);
+    GE_CHECK_NOTNULL(output_desc);
+    if (TensorUtils::GetSize(*output_desc, size) == GRAPH_SUCCESS) {
+      if (max_size < size) {
+        max_size = size;
+        max_index = i;
+      }
+    }
+  }
+  if (max_index >= switchn->GetOpDesc()->GetOutputsSize()) {
+    GELOGE(INTERNAL_ERROR, "No max size found from switchn node %s", switchn->GetName().c_str());
+    return INTERNAL_ERROR;
+  }
+  auto output_desc = switchn->GetOpDesc()->MutableOutputDesc(max_index);
+  auto input_desc = switchn->GetOpDesc()->MutableInputDesc(0);
+  GE_CHECK_NOTNULL(input_desc);
+  input_desc->SetDataType(output_desc->GetDataType());
+  input_desc->SetOriginDataType(output_desc->GetOriginDataType());
+  input_desc->SetShape(output_desc->GetShape());
+  input_desc->SetOriginShape(output_desc->GetOriginShape());
+  TensorUtils::SetSize(*input_desc, max_size);
+
+  auto data_opdesc = data->GetOpDesc();
+  GE_CHECK_NOTNULL(data_opdesc);
+  auto ret = data_opdesc->UpdateOutputDesc(0, *input_desc);
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Failed to update data %s output using switchn %s", data->GetName().c_str(),
+           switchn->GetName().c_str());
+    return INTERNAL_ERROR;
+  }
+  ret = data_opdesc->UpdateInputDesc(0, *input_desc);
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Failed to update data %s input using switchn %s", data->GetName().c_str(),
+           switchn->GetName().c_str());
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
 }  // namespace ge
diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h
index 689e66e4..70b57597 100644
--- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h
+++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.h
@@ -38,30 +38,29 @@ class InsertNewOpUtil {
   Status Parse(const char *conf_path);

-  Status InsertNewOps(const ge::ComputeGraphPtr &graph);
-
   Status InsertAippOps(ge::ComputeGraphPtr &graph, std::string &aippConfigPath);

   void ClearNewOps();

+  Status UpdateDataNodeByAipp(const ComputeGraphPtr &graph);
+
 private:
   Status CheckPositionNotRepeat();

-  Status AddMultiShapeInputData(const ge::ComputeGraphPtr &graph);
-
   Status GetAippParams(const std::unique_ptr<domi::AippOpParams> &aippParams, const ge::NodePtr &aipp_node);

   Status CheckGraph(const ge::ComputeGraphPtr &graph);

-  InsertNewOpUtil() {}
-
-  Status AddAippInputData(ge::NodePtr aipp_node, ge::ComputeGraphPtr graph);
+  InsertNewOpUtil() = default;

   ~InsertNewOpUtil() = default;

   std::vector<std::unique_ptr<InsertOpBase>> insert_ops_;

   std::unique_ptr<domi::InsertNewOps> insert_op_conf_;
+
+  Status UpdatePrevNodeByAipp(NodePtr &node, std::set<NodePtr> &switchns);
+  Status UpdateDataBySwitchN(const NodePtr &switchn, const NodePtr &data);
 };
 }  // namespace ge
diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.cc b/src/ge/graph/preprocess/multi_batch_copy_graph.cc
index 3a4137ea..b93b02f9 100644
--- a/src/ge/graph/preprocess/multi_batch_copy_graph.cc
+++ b/src/ge/graph/preprocess/multi_batch_copy_graph.cc
@@ -17,31 +17,33 @@
 #include "graph/preprocess/multi_batch_copy_graph.h"

 #include
-#include
 #include
+#include

-#include "graph/ge_context.h"
-#include "graph/utils/graph_utils.h"
-#include "graph/utils/node_utils.h"
-#include
"graph/utils/attr_utils.h" -#include "graph/debug/ge_attr_define.h" +#include "common/formats/utils/formats_trans_utils.h" +#include "common/ge/ge_util.h" +#include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" -#include "framework/common/types.h" #include "framework/common/string_util.h" -#include "framework/common/debug/ge_log.h" -#include "common/ge/ge_util.h" -#include "common/formats/utils/formats_trans_utils.h" +#include "framework/common/types.h" +#include "framework/omg/omg_inner_types.h" +#include "graph/debug/ge_attr_define.h" +#include "graph/ge_context.h" #include "graph/passes/prune_pass.h" +#include "graph/utils/attr_utils.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/node_utils.h" namespace ge { namespace multibatch { namespace { +const char *const kMbatchSwitchnName = "mbatch-switch-name"; const int kSwitchNDataIndex = 0; const int kSwitchNPredIndex = 1; const int kDataOutIndex = 0; const int kDataInIndex = 0; const int kMergeDataOutIndex = 0; -const size_t kMaxShapesCount = 16; +const size_t kMaxShapesCount = 100; const size_t kMinShapesCount = 2; inline bool IsDataLikeType(const std::string &node_type) { return (node_type == DATA) || (node_type == AIPP); } @@ -57,18 +59,16 @@ NodePtr InsertMergeNodeToGraph(const std::string &name, size_t input_num, const GeTensorDesc tensor_desc; for (size_t i = 0; i < input_num; ++i) { auto ret = desc->AddInputDesc("x" + std::to_string(i), tensor_desc); - if (ret != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to create merge node %s, failed to add input %zu, error-code %u", name.c_str(), i, - ret); - return nullptr; - } + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + GELOGE(INTERNAL_ERROR, "Failed to create merge node %s, failed to add input %zu, error-code %u", + name.c_str(), i, ret); + return nullptr); } auto ret = desc->AddOutputDesc("y", tensor_desc); - if (ret != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to create merge node %s, failed to add output 'y', error-code %u", name.c_str(), - ret); - return nullptr; - } + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + GELOGE(INTERNAL_ERROR, "Failed to create merge node %s, failed to add output 'y', error-code %u", + name.c_str(), ret); + return nullptr); tensor_desc.SetDataType(DT_INT32); ret = desc->AddOutputDesc("value_index", tensor_desc); if (ret != GRAPH_SUCCESS) { @@ -86,34 +86,33 @@ NodePtr InsertMergeNodeToGraph(const std::string &name, size_t input_num, const NodePtr InsertCopyNode(const NodePtr &node, const std::string &name) { auto src_op_desc = node->GetOpDesc(); - if (src_op_desc == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to copy node %s to %s, the OpDesc is null", node->GetName().c_str(), name.c_str()); - return nullptr; - } + GE_IF_BOOL_EXEC(src_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "Failed to copy node %s to %s, the OpDesc is null", + node->GetName().c_str(), name.c_str()); + return nullptr); + auto desc = AttrUtils::CopyOpDesc(src_op_desc); - if (desc == nullptr) { - GELOGE(OUT_OF_MEMORY, "Failed to create op desc for copy node for node %s name %s", node->GetName().c_str(), - name.c_str()); - return nullptr; - } + GE_IF_BOOL_EXEC(desc == nullptr, GELOGE(OUT_OF_MEMORY, "Failed to create op desc for copy node for node %s name %s", + node->GetName().c_str(), name.c_str()); + return nullptr); + desc->SetName(name); desc->CopyAttrsFrom(*src_op_desc); for (uint32_t i = 0; i < node->GetAllInDataAnchorsSize(); ++i) { auto input_desc = desc->MutableInputDesc(i); - if (input_desc == nullptr) { - 
GELOGE(INTERNAL_ERROR, "Failed to get input desc by index %u from node %s when copy from %s", i, - desc->GetName().c_str(), node->GetName().c_str()); - return nullptr; - } + GE_IF_BOOL_EXEC(input_desc == nullptr, + GELOGE(INTERNAL_ERROR, "Failed to get input desc by index %u from node %s when copy from %s", i, + desc->GetName().c_str(), node->GetName().c_str()); + return nullptr); + input_desc->CopyAttrsFrom(src_op_desc->GetInputDesc(i)); } for (uint32_t i = 0; i < node->GetAllOutDataAnchorsSize(); ++i) { auto output_desc = desc->MutableOutputDesc(i); - if (output_desc == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to get output desc by index %u from node %s when copy from %s", i, - desc->GetName().c_str(), node->GetName().c_str()); - return nullptr; - } + GE_IF_BOOL_EXEC(output_desc == nullptr, + GELOGE(INTERNAL_ERROR, "Failed to get output desc by index %u from node %s when copy from %s", i, + desc->GetName().c_str(), node->GetName().c_str()); + return nullptr); + output_desc->CopyAttrsFrom(src_op_desc->GetOutputDesc(i)); } auto graph = node->GetOwnerComputeGraph(); @@ -242,8 +241,8 @@ Status MultiBatchGraphCopyer::Init() { if (ret != SUCCESS) { return ret; } - auto tmp_all_nodes = graph_->GetAllNodes(); - for (auto &node : tmp_all_nodes) { + + for (auto &node : graph_->GetAllNodes()) { origin_all_nodes_.emplace_back(node); if (IsDataLikeType(node->GetType())) { origin_data_nodes_.emplace_back(node); @@ -322,12 +321,11 @@ NodePtr MultiBatchGraphCopyer::InsertMergeNode(const NodePtr &node, int index) { return merge_nodes[index]; } - auto merge_node_name = node->GetName() + "_huawei_mbatch_merge_" + std::to_string(index); + auto merge_node_name = node->GetName() + "_ascend_mbatch_merge_" + std::to_string(index); auto merge_node = InsertMergeNodeToGraph(merge_node_name, shapes_.size(), node->GetOwnerComputeGraph()); - if (merge_node == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to create merge node for node %s, out index %d", node->GetName().c_str(), index); - return nullptr; - } + GE_IF_BOOL_EXEC(merge_node == nullptr, GELOGE(INTERNAL_ERROR, "Failed to create merge node for node %s, out index %d", + node->GetName().c_str(), index); + return nullptr); merge_nodes[index] = merge_node; GELOGI("Create merge node %s for node %s index %d", merge_node_name.c_str(), node->GetName().c_str(), index); return merge_node; @@ -428,7 +426,7 @@ NodePtr MultiBatchGraphCopyer::InsertShapeDataNode() { GELOGE(OUT_OF_MEMORY, "Failed to create shape data node, out of memory"); return nullptr; } - desc->SetName("huawei_mbatch_shape_data"); + desc->SetName("ascend_mbatch_shape_data"); desc->SetType(DATA); GeTensorDesc tensor_desc; @@ -481,7 +479,8 @@ Status MultiBatchGraphCopyer::CheckArguments() { size_t shape_size = shapes_.at(0).size(); for (auto &shape : shapes_) { if (shape_size != shape.size()) { - GELOGE(PARAM_INVALID, "All batch shapes size must be the same"); + GELOGE(PARAM_INVALID, "All batch shapes size must be the same, first group's size is %zu and another's is %zu.", + shape_size, shape.size()); return PARAM_INVALID; } for (auto dim : shape) { @@ -522,11 +521,10 @@ Status MultiBatchGraphCopyer::LinkDataToMerge(const NodePtr &data, const NodePtr switchn->GetName().c_str()); for (size_t i = 0; i < shapes_.size(); ++i) { auto ret = GraphUtils::AddEdge(switchn->GetOutDataAnchor(i), merge->GetInDataAnchor(i)); - if (ret != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to add edge between switchn %s(%zu) to merge %s(%zu), error-code %u", - switchn->GetName().c_str(), i, merge->GetName().c_str(), i, 
ret); - return INTERNAL_ERROR; - } + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, + GELOGE(INTERNAL_ERROR, "Failed to add edge between switchn %s(%zu) to merge %s(%zu), error-code %u", + switchn->GetName().c_str(), i, merge->GetName().c_str(), i, ret); + return INTERNAL_ERROR); } return SUCCESS; } @@ -548,17 +546,16 @@ Status MultiBatchGraphCopyer::LinkNodeToMerge(const NodePtr &node, int out_index GELOGI("The node %s on the batch branch edge does not have any data output, create a const %s for it", src_node->GetName().c_str(), const_name.c_str()); auto const_node = InsertConst(const_name, graph_); - if (const_node == nullptr) { - GELOGE(OUT_OF_MEMORY, "Failed to create const for node %s to connect to a merge node", - src_node->GetName().c_str()); - return OUT_OF_MEMORY; - } + GE_IF_BOOL_EXEC(const_node == nullptr, + GELOGE(OUT_OF_MEMORY, "Failed to create const for node %s to connect to a merge node", + src_node->GetName().c_str()); + return OUT_OF_MEMORY); + auto ret = GraphUtils::AddEdge(src_node->GetOutControlAnchor(), const_node->GetInControlAnchor()); - if (ret != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to add control edge from %s to %s", src_node->GetName().c_str(), - const_node->GetName().c_str()); - return INTERNAL_ERROR; - } + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add control edge from %s to %s", + src_node->GetName().c_str(), const_node->GetName().c_str()); + return INTERNAL_ERROR); + src_node = const_node; } auto ret = GraphUtils::AddEdge(src_node->GetOutDataAnchor(out_index), merge->GetInDataAnchor(i)); @@ -624,13 +621,17 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &data) { GELOGE(OUT_OF_MEMORY, "Failed to create switchn for data %s", data->GetName().c_str()); return OUT_OF_MEMORY; } - switchn_desc->SetName(data->GetName() + "_huawei_mbatch_switchn"); + switchn_desc->SetName(data->GetName() + "_ascend_mbatch_switchn"); switchn_desc->SetType(SWITCHN); GeTensorDesc tensor(NodeUtils::GetOutputDesc(*data, kDataOutIndex)); - switchn_desc->AddInputDesc(tensor); // data + if (switchn_desc->AddInputDesc(tensor) != GRAPH_SUCCESS) { // data + return OUT_OF_MEMORY; + } GeTensorDesc pred_tensor; - switchn_desc->AddInputDesc(pred_tensor); // pred + if (switchn_desc->AddInputDesc(pred_tensor) != GRAPH_SUCCESS) { // pred + return OUT_OF_MEMORY; + } for (size_t i = 0; i < shapes_.size(); ++i) { auto shape = data_shape; auto ret = CalcShape(shapes_.at(i), shape); @@ -644,7 +645,10 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &data) { GELOGE(INTERNAL_ERROR, "Failed to add attr value on output %zu tensor", i); return INTERNAL_ERROR; } - switchn_desc->AddOutputDesc(tensor); + if (switchn_desc->AddOutputDesc(tensor) != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Opdesc AddOutputDesc failed"); + return GRAPH_FAILED; + } GELOGD("The SwitchN %s output index %zu, shape %s", switchn_desc->GetName().c_str(), i, shape.ToString().c_str()); } @@ -652,6 +656,10 @@ Status MultiBatchGraphCopyer::InsertSwitchNForData(const NodePtr &data) { GELOGE(INTERNAL_ERROR, "Failed to add insert attr on switchn node %s", switchn_desc->GetName().c_str()); return INTERNAL_ERROR; } + if (!AttrUtils::SetStr(data->GetOpDesc(), kMbatchSwitchnName, switchn_desc->GetName())) { + GELOGE(INTERNAL_ERROR, "Failed to add switchn attr on data node %s", data->GetName().c_str()); + return INTERNAL_ERROR; + } auto switchn = graph_->AddNode(switchn_desc); if (switchn == nullptr) { @@ -693,7 +701,7 @@ Status MultiBatchGraphCopyer::InsertMergeForEdgeNode(const 
NodePtr &node) { Status MultiBatchGraphCopyer::CopyNodeInBatchBranch(const NodePtr &node) { auto ©ed_nodes = nodes_to_batch_nodes_[node.get()]; for (size_t i = 0; i < shapes_.size(); ++i) { - auto copyed_node = InsertCopyNode(node, node->GetName() + "_huawei_mbatch_batch_" + std::to_string(i)); + auto copyed_node = InsertCopyNode(node, node->GetName() + "_ascend_mbatch_batch_" + std::to_string(i)); if (copyed_node == nullptr) { GELOGE(INTERNAL_ERROR, "Failed to add node to graph when copy node %s", node->GetName().c_str()); return INTERNAL_ERROR; @@ -734,17 +742,14 @@ Status MultiBatchGraphCopyer::LinkDataToSwitchN(const NodePtr &data) { auto switchn = data_nodes_to_switchn_[data.get()]; auto ret = GraphUtils::AddEdge(shape_data_->GetOutDataAnchor(kDataOutIndex), switchn->GetInDataAnchor(kSwitchNPredIndex)); - if (ret != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to link shape data %s to switchn %s", shape_data_->GetName().c_str(), - switchn->GetName().c_str()); - return INTERNAL_ERROR; - } + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link shape data %s to switchn %s", + shape_data_->GetName().c_str(), switchn->GetName().c_str()); + return INTERNAL_ERROR); ret = GraphUtils::AddEdge(data->GetOutDataAnchor(kDataOutIndex), switchn->GetInDataAnchor(kSwitchNDataIndex)); - if (ret != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to link data %s to switchn %s", data->GetName().c_str(), switchn->GetName().c_str()); - return INTERNAL_ERROR; - } + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link data %s to switchn %s", + data->GetName().c_str(), switchn->GetName().c_str()); + return INTERNAL_ERROR); return SUCCESS; } Status MultiBatchGraphCopyer::LinkToMerge(const NodePtr &node) { @@ -843,18 +848,18 @@ Status MultiBatchGraphCopyer::LinkToNodeOutBranch(const NodePtr &node) { in_node->GetName().c_str(), node->GetName().c_str()); return INTERNAL_ERROR; } + + GE_IF_BOOL_EXEC(in_node->GetOutControlAnchor() == nullptr, + GELOGE(INTERNAL_ERROR, "Innode outputControlAnchor is null"); + return INTERNAL_ERROR); auto ret = in_node->GetOutControlAnchor()->Unlink(node->GetInControlAnchor()); - if (ret != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to unlink the control edge from %s to %s", in_node->GetName().c_str(), - node->GetName().c_str()); - return INTERNAL_ERROR; - } + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to unlink the control edge from %s to %s", + in_node->GetName().c_str(), node->GetName().c_str()); + return INTERNAL_ERROR); ret = GraphUtils::AddEdge(merge_node->GetOutControlAnchor(), node->GetInControlAnchor()); - if (ret != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to add control edge from %s to %s", merge_node->GetName().c_str(), - node->GetName().c_str()); - return INTERNAL_ERROR; - } + GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add control edge from %s to %s", + merge_node->GetName().c_str(), node->GetName().c_str()); + return INTERNAL_ERROR); GELOGI("Link control edge from merge %s(from %s) to %s", merge_node->GetName().c_str(), in_node->GetName().c_str(), node->GetName().c_str()); } @@ -865,22 +870,30 @@ Status MultiBatchGraphCopyer::LinkToNodeOutBranch(const NodePtr &node) { Status ProcessMultiBatch(ComputeGraphPtr &graph) { const int kDecimal = 10; std::vector> shapes; - std::string option; - if (GetContext().GetOption("ge.dynamic_batchsize", option) == GRAPH_SUCCESS) { - GELOGD("Found dynamic batch option, value %s", option.c_str()); - std::vector dims 
= StringUtils::Split(option, ',');
+  if (!domi::GetContext().dynamic_batch_size.empty()) {
+    GELOGD("Found dynamic batch option, value %s", domi::GetContext().dynamic_batch_size.c_str());
+    std::vector<std::string> dims = ge::StringUtils::Split(domi::GetContext().dynamic_batch_size, ',');
     for (const auto &dim : dims) {
+      if (dim.empty()) {
+        continue;
+      }
       shapes.emplace_back(std::vector<int64_t>({std::strtol(dim.c_str(), nullptr, kDecimal)}));
       GELOGI("Found dynamic batch, shape %s", formats::JoinToString(*shapes.rbegin()).c_str());
     }
   }
-  if (GetContext().GetOption("ge.dynamic_imagesize", option) == GRAPH_SUCCESS) {
-    GELOGD("Found dynamic image size option, value %s", option.c_str());
-    std::vector<std::string> shape_strs = StringUtils::Split(option, ';');
+  if (!domi::GetContext().dynamic_image_size.empty()) {
+    GELOGD("Found dynamic image size option, value %s", domi::GetContext().dynamic_image_size.c_str());
+    std::vector<std::string> shape_strs = ge::StringUtils::Split(domi::GetContext().dynamic_image_size, ';');
     for (const auto &shape_str : shape_strs) {
+      if (shape_str.empty()) {
+        continue;
+      }
       std::vector<int64_t> shape;
-      std::vector<std::string> dims = StringUtils::Split(shape_str, ',');
+      std::vector<std::string> dims = ge::StringUtils::Split(shape_str, ',');
       for (const auto &dim : dims) {
+        if (dim.empty()) {
+          continue;
+        }
         shape.emplace_back(std::strtol(dim.c_str(), nullptr, kDecimal));
       }
       shapes.emplace_back(shape);
@@ -888,7 +901,7 @@ Status ProcessMultiBatch(ComputeGraphPtr &graph) {
     }
   }
   if (shapes.empty()) {
-    GELOGD("There is no multi-batch options, no need to process multi-batch copys");
+    GELOGD("There is no multi-batch option, no need to process multi-batch copy");
     return SUCCESS;
   }

diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.h b/src/ge/graph/preprocess/multi_batch_copy_graph.h
index 7421469e..b3642dbd 100644
--- a/src/ge/graph/preprocess/multi_batch_copy_graph.h
+++ b/src/ge/graph/preprocess/multi_batch_copy_graph.h
@@ -16,9 +16,9 @@
 #ifndef GE_GRAPH_PREPROCESS_MULTI_BATCH_COPY_GRAPH_H_
 #define GE_GRAPH_PREPROCESS_MULTI_BATCH_COPY_GRAPH_H_
-#include
 #include
 #include
+#include

 #include "external/ge/ge_api_error_codes.h"

@@ -55,19 +55,18 @@ class MultiBatchGraphCopyer {
   Status UpdateMaxShapeToData(const NodePtr &data);
   Status InsertMergeForEdgeNode(const NodePtr &node);
-  /**
-   * Insert a merge node for src node `node` on output index `index`. The merge node will be used to merge all nodes
-   * in batch-branch to one output to the node out of the batch-branch.
-   * Cond 1: If the `index` is -1, then the src node link a data edge(at output 0) to the merge node,
-   * Cond 2: In condition 1, if the src node does not have any data output, we create a const node after it,
-   * the result like this:
-   *         src_node ---------> const_for_src_node --------> merge
-   *                   control                        data
-   * Cond 3: If the src node is a data-like node, the SwitchN after it will be link to the merge node.
-   * @param node
-   * @param index
-   * @return
-   */
+
+  /// Insert a merge node for src node `node` on output index `index`. The merge node will be used to merge all nodes
+  /// in the batch-branch to one output to the node out of the batch-branch.
+  /// Cond 1: If the `index` is -1, then the src node links a data edge (at output 0) to the merge node,
+  /// Cond 2: In condition 1, if the src node does not have any data output, we create a const node after it,
+  /// the result looks like this:
+  ///         src_node ---------> const_for_src_node --------> merge
+  ///                   control                        data
+  /// Cond 3: If the src node is a data-like node, the SwitchN after it will be linked to the merge node.
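// [Editor's note, illustrative only -- not part of the patch] ProcessMultiBatch
// above reads the dynamic-batch option as a comma-separated list (e.g.
// "1,2,4,8") and the dynamic-image-size option as semicolon-separated groups
// (e.g. "224,224;448,448"), skipping empty fragments. A self-contained sketch
// of that parsing using only the standard library; the function name is
// illustrative, not a GE API:
//
//   #include <cstdint>
//   #include <cstdlib>
//   #include <sstream>
//   #include <string>
//   #include <vector>
//
//   std::vector<std::vector<int64_t>> ParseImageSizes(const std::string &option) {
//     std::vector<std::vector<int64_t>> shapes;
//     std::stringstream groups(option);
//     std::string group;
//     while (std::getline(groups, group, ';')) {  // "224,224;448,448" -> per-shape groups
//       if (group.empty()) continue;
//       std::vector<int64_t> shape;
//       std::stringstream dims(group);
//       std::string dim;
//       while (std::getline(dims, dim, ',')) {  // "224,224" -> individual dims
//         if (dim.empty()) continue;
//         shape.push_back(static_cast<int64_t>(std::strtol(dim.c_str(), nullptr, 10)));
//       }
//       shapes.push_back(shape);
//     }
//     return shapes;
//   }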
+ /// @param node + /// @param index + /// @return NodePtr InsertMergeNode(const NodePtr &node, int index); Status CopyNodeInBatchBranch(const NodePtr &node); diff --git a/src/ge/inc/node_pass.h b/src/ge/inc/node_pass.h new file mode 100644 index 00000000..4334c50d --- /dev/null +++ b/src/ge/inc/node_pass.h @@ -0,0 +1,66 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_INC_NODE_PASS_H_ +#define GE_INC_NODE_PASS_H_ + +#include +#include "common/op/ge_op_utils.h" +#include "graph/compute_graph.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/op_desc.h" +#include "graph/range_vistor.h" +#include "graph/utils/attr_utils.h" +#include "graph/utils/graph_utils.h" +#include "graph/utils/op_desc_utils.h" +#include "graph/utils/tensor_utils.h" +#include "inc/pass.h" +namespace ge { +/// +/// @ingroup domi_omg +/// @brief node pass +/// @author +/// +class NodePass : public Pass { + public: + /// + /// run node pass + /// @param [in] node node to be optimized + /// @return SUCCESS optimized successfully + /// @return TO_BE_DELETED optimized successfully and the node need to be deleted + /// @return NOT_CHANGED not optimized + /// @return others optimize failed + /// @author + /// + virtual Status Run(ge::NodePtr node) = 0; + + /// Optimize to weight, Set the "is input const" flag of the output node to true + /// @param [in] node node to be optimized + /// @return SUCCESS optimized successfully + /// @return others optimize failed + /// + Status SetOutNodeWeightDef(ge::NodePtr node, std::vector &v_weight); + + /// Update node connection relationship + /// @param [in] node The node to be optimized + /// @return SUCCESS Optimized successfully + /// @return FAILED Optimization failure + /// + Status UpdateNodeInfo(ge::NodePtr node); +}; +} // namespace ge +#endif // GE_INC_NODE_PASS_H_ diff --git a/src/ge/init/gelib.cc b/src/ge/init/gelib.cc index 4fa2664f..a9ef96c1 100644 --- a/src/ge/init/gelib.cc +++ b/src/ge/init/gelib.cc @@ -15,27 +15,29 @@ */ #include "init/gelib.h" -#include + #include +#include #include #include #include #include #include -#include "framework/common/debug/ge_log.h" -#include "common/ge/plugin_manager.h" + #include "common/ge/ge_util.h" +#include "common/ge/plugin_manager.h" #include "common/profiling/profiling_manager.h" -#include "graph/manager/graph_mem_allocator.h" -#include "graph/manager/graph_var_manager.h" -#include "runtime/kernel.h" +#include "common/properties_manager.h" +#include "framework/common/debug/ge_log.h" +#include "ge/ge_api_types.h" +#include "ge_local_engine/engine/host_cpu_engine.h" #include "graph/ge_context.h" #include "graph/ge_global_options.h" -#include "ge/ge_api_types.h" -#include #include "graph/load/new_model_manager/model_manager.h" +#include "graph/manager/graph_mem_allocator.h" +#include "graph/manager/graph_var_manager.h" #include "omm/csa_interact.h" -#include "common/properties_manager.h" +#include "runtime/kernel.h" using Json = 
nlohmann::json; @@ -76,43 +78,63 @@ Status GELib::InnerInitialize(const map &options) { } GELOGI("GE System initial."); - Status init_system_status = SystemInitialize(options); - if (init_system_status != SUCCESS) { - GELOGE(init_system_status); + GE_TIMESTAMP_START(SystemInitialize); + Status initSystemStatus = SystemInitialize(options); + GE_TIMESTAMP_END(SystemInitialize, "InnerInitialize::SystemInitialize"); + if (initSystemStatus != SUCCESS) { + GELOGE(initSystemStatus); RollbackInit(); - return init_system_status; + return initSystemStatus; } GELOGI("engineManager initial."); - Status init_em_status = engine_manager_.Initialize(options); - if (init_em_status != SUCCESS) { - GELOGE(init_em_status); + GE_TIMESTAMP_START(EngineInitialize); + Status initEmStatus = engineManager_.Initialize(options); + GE_TIMESTAMP_END(EngineInitialize, "InnerInitialize::EngineInitialize"); + if (initEmStatus != SUCCESS) { + GELOGE(initEmStatus); RollbackInit(); - return init_em_status; + return initEmStatus; } GELOGI("opsManager initial."); - Status init_ops_status = ops_manager_.Initialize(options); - if (init_ops_status != SUCCESS) { - GELOGE(init_ops_status); + GE_TIMESTAMP_START(OpsManagerInitialize); + Status initOpsStatus = opsManager_.Initialize(options); + GE_TIMESTAMP_END(OpsManagerInitialize, "InnerInitialize::OpsManagerInitialize"); + if (initOpsStatus != SUCCESS) { + GELOGE(initOpsStatus); RollbackInit(); - return init_ops_status; + return initOpsStatus; } GELOGI("sessionManager initial."); - Status init_sm_status = session_manager_.Initialize(options); - if (init_sm_status != SUCCESS) { - GELOGE(init_sm_status); + GE_TIMESTAMP_START(SessionManagerInitialize); + Status initSmStatus = sessionManager_.Initialize(options); + GE_TIMESTAMP_END(SessionManagerInitialize, "InnerInitialize::SessionManagerInitialize"); + if (initSmStatus != SUCCESS) { + GELOGE(initSmStatus); RollbackInit(); - return init_sm_status; + return initSmStatus; } GELOGI("memoryMallocSize initial."); - Status init_mem_status = VarManager::Instance(0)->SetMemoryMallocSize(options); - if (init_mem_status != SUCCESS) { - GELOGE(init_mem_status, "failed to set malloc size"); + GE_TIMESTAMP_START(SetMemoryMallocSize); + Status initMemStatus = VarManager::Instance(0)->SetMemoryMallocSize(options); + GE_TIMESTAMP_END(SetMemoryMallocSize, "InnerInitialize::SetMemoryMallocSize"); + if (initMemStatus != SUCCESS) { + GELOGE(initMemStatus, "failed to set malloc size"); + RollbackInit(); + return initMemStatus; + } + + GELOGI("Start to initialize HostCpuEngine"); + GE_TIMESTAMP_START(HostCpuEngineInitialize); + Status initHostCpuEngineStatus = HostCpuEngine::GetInstance().Initialize(); + GE_TIMESTAMP_END(HostCpuEngineInitialize, "InnerInitialize::HostCpuEngineInitialize"); + if (initHostCpuEngineStatus != SUCCESS) { + GELOGE(initHostCpuEngineStatus, "Failed to initialize HostCpuEngine"); RollbackInit(); - return init_mem_status; + return initHostCpuEngineStatus; } init_flag_ = true; @@ -145,6 +167,12 @@ Status GELib::SystemInitialize(const map &options) { PropertiesManager::Instance().AddDumpPropertyValue(DUMP_ALL_MODEL, {}); PropertiesManager::Instance().SetDumpOutputPath(dump_path); } + auto step_iter = options.find(OPTION_EXEC_DUMP_STEP); + if (step_iter != options.end()) { + std::string dump_step = step_iter->second; + GELOGD("Get dump step %s successfully", dump_step.c_str()); + PropertiesManager::Instance().SetDumpStep(dump_step); + } } if (is_train_mode_) { @@ -230,7 +258,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status 
GELib::InitSystemWithOpt GE_LOGE_IF(ret != SUCCESS, "write job state failed, ret:%u", ret); options.physical_device_id = options.device_id; - // The physical ID is transferred to the logical ID. FMK receives physical ID and needs to be converted + // The physical ID is transferred to the logical ID. FMK receives physical ID + // and needs to be converted uint32_t dev_logic_index = 0; rtError_t rt_ret = rtGetDeviceIndexByPhyId(static_cast(options.device_id), &dev_logic_index); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, @@ -243,8 +272,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt GetContext().SetCtxDeviceId(dev_logic_index); GE_CHK_RT_RET(rtSetDevice(options.device_id)); - // In the scenario that the automatic add fusion is set, but there is no cleanaddr operator, - // maybe need to check it + + // In the scenario that the automatic add fusion is set, but there is no + // cleanaddr operator, maybe need to check it is_system_inited = true; is_shutdown = false; @@ -257,10 +287,10 @@ Status GELib::SystemShutdownWithOptions(const Options &options) { GELOGI("Training finalize GELib begin."); std::lock_guard lock(status_mutex_); - GE_IF_BOOL_EXEC(is_shutdown || !is_system_inited, - GELOGW("System Shutdown with options is already is_shutdown or system does not inited. " - "is_shutdown:%d is_omm_inited:%d", - is_shutdown, is_system_inited); + GE_IF_BOOL_EXEC(is_shutdown || !is_system_inited, GELOGW("System Shutdown with options is already is_shutdown " + "or system does not inited. " + "is_shutdown:%d is_omm_inited:%d", + is_shutdown, is_system_inited); return SUCCESS); GE_CHK_RT(rtDeviceReset(options.device_id)); @@ -294,7 +324,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithout static bool is_inited = false; if (is_inited) { - GELOGW("System init without options is already inited, don't need to init again."); + GELOGW( + "System init without options is already inited, don't need to init " + "again."); return SUCCESS; } is_inited = true; @@ -313,27 +345,26 @@ Status GELib::Finalize() { GELOGW("not initialize"); return SUCCESS; } - + Status final_state = SUCCESS; + Status mid_state; GELOGI("engineManager finalization."); - Status final_em_status = engine_manager_.Finalize(); - GELOGI("sessionManager finalization."); - Status final_sm_status = session_manager_.Finalize(); - - if (final_em_status != SUCCESS) { - GELOGE(final_em_status); - return final_em_status; + mid_state = engineManager_.Finalize(); + if (mid_state != SUCCESS) { + GELOGW("engineManager finalize failed"); + final_state = mid_state; } - - if (final_sm_status != SUCCESS) { - GELOGE(final_sm_status); - return final_sm_status; + GELOGI("sessionManager finalization."); + mid_state = sessionManager_.Finalize(); + if (mid_state != SUCCESS) { + GELOGW("sessionManager finalize failed"); + final_state = mid_state; } GELOGI("opsManager finalization."); - Status final_ops_status = ops_manager_.Finalize(); - if (final_ops_status != SUCCESS) { - GELOGE(final_ops_status); - return final_ops_status; + mid_state = opsManager_.Finalize(); + if (mid_state != SUCCESS) { + GELOGW("opsManager finalize failed"); + final_state = mid_state; } GELOGI("VarManagerPool finalization."); @@ -342,20 +373,25 @@ Status GELib::Finalize() { GELOGI("MemManager finalization."); MemManager::Instance().Finalize(); -#ifdef DAVINCI_CLOUD + GELOGI("HostCpuEngine finalization."); + HostCpuEngine::GetInstance().Finalize(); + if (is_train_mode_) { GELOGI("System ShutDown."); - Status shutdown_status = 
SystemShutdownWithOptions(this->options_); - if (shutdown_status != SUCCESS) { - GELOGE(shutdown_status); - return shutdown_status; + mid_state = SystemShutdownWithOptions(this->options_); + if (mid_state != SUCCESS) { + GELOGW("System shutdown with options failed"); + final_state = mid_state; } } is_train_mode_ = false; -#endif instancePtr_ = nullptr; init_flag_ = false; + if (final_state != SUCCESS) { + GELOGE(FAILED, "MemManager finalization."); + return final_state; + } GELOGI("finalization success."); return SUCCESS; } @@ -364,14 +400,14 @@ Status GELib::Finalize() { std::shared_ptr GELib::GetInstance() { return instancePtr_; } void GELib::RollbackInit() { - if (engine_manager_.init_flag_) { - (void)engine_manager_.Finalize(); + if (engineManager_.init_flag_) { + (void)engineManager_.Finalize(); } - if (ops_manager_.init_flag_) { - (void)ops_manager_.Finalize(); + if (opsManager_.init_flag_) { + (void)opsManager_.Finalize(); } - if (session_manager_.init_flag_) { - (void)session_manager_.Finalize(); + if (sessionManager_.init_flag_) { + (void)sessionManager_.Finalize(); } MemManager::Instance().Finalize(); VarManagerPool::Instance().Destroy(); diff --git a/src/ge/init/gelib.h b/src/ge/init/gelib.h index 06cb07ca..0945907a 100644 --- a/src/ge/init/gelib.h +++ b/src/ge/init/gelib.h @@ -48,19 +48,19 @@ class GELib { Status Finalize(); // get DNNEngineManager object - DNNEngineManager &DNNEngineManagerObj() { return engine_manager_; } + DNNEngineManager &DNNEngineManagerObj() { return engineManager_; } // get OpsKernelManager object - OpsKernelManager &OpsKernelManagerObj() { return ops_manager_; } + OpsKernelManager &OpsKernelManagerObj() { return opsManager_; } // get SessionManager object - SessionManager &SessionManagerObj() { return session_manager_; } + SessionManager &SessionManagerObj() { return sessionManager_; } // get Initial flag bool InitFlag() const { return init_flag_; } // get TrainMode flag - bool isTrainMode() const { return is_train_mode_; } + bool isTrainMode() { return is_train_mode_; } // add head stream to model bool HeadStream() const { return head_stream_; } @@ -77,9 +77,9 @@ class GELib { void RollbackInit(); void InitOptions(const map &options); - DNNEngineManager engine_manager_; - OpsKernelManager ops_manager_; - SessionManager session_manager_; + DNNEngineManager engineManager_; + OpsKernelManager opsManager_; + SessionManager sessionManager_; std::mutex status_mutex_; bool init_flag_ = false; Options options_; diff --git a/src/ge/ir_build/ge_ir_build.cc b/src/ge/ir_build/ge_ir_build.cc new file mode 100644 index 00000000..671a34af --- /dev/null +++ b/src/ge/ir_build/ge_ir_build.cc @@ -0,0 +1,292 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "external/ge/ge_ir_build.h"
+
+#include
+#include "generator/ge_generator.h"
+#include "model/ge_model.h"
+#include "graph/ge_tensor.h"
+#include "init/gelib.h"
+#include "ge/ge_api_types.h"
+#include "graph/compute_graph.h"
+#include "graph/utils/type_utils.h"
+#include "external/register/register_types.h"
+#include "common/auth/file_saver.h"
+#include "offline/atc_ir_common.h"
+#include "framework/common/debug/ge_log.h"
+#include "framework/common/ge_inner_error_codes.h"
+#include "framework/omg/omg_inner_types.h"
+#include "framework/common/types.h"
+#include "framework/common/util.h"
+#include "framework/common/string_util.h"
+#include "framework/omg/omg_inner_types.h"
+
+using domi::GetContext;
+using ge::FileSaver;
+using ge::GRAPH_PARAM_INVALID;
+using ge::GRAPH_SUCCESS;
+using ge::ParseInputShape;
+using std::string;
+using namespace std;
+
+namespace ge {
+
+static std::map<std::string, domi::domiTensorFormat_t> input_format_str_to_geformat = {
+    {"ND", domi::DOMI_TENSOR_ND},     {"NCHW", domi::DOMI_TENSOR_NCHW},       {"NHWC", domi::DOMI_TENSOR_NHWC},
+    {"CHWN", domi::DOMI_TENSOR_CHWN}, {"NC1HWC0", domi::DOMI_TENSOR_NC1HWC0}, {"NHWC1C0", domi::DOMI_TENSOR_NHWC1C0},
+};
+const std::string IR_OPTION_TARGET = "target";
+const std::string IR_OPTION_MODE = "mode";
+const std::string IR_OP_CONF_DELIMITER = ":";
+
+graphStatus aclgrphBuildInitialize(std::map<std::string, std::string> global_options) {
+  GELOGD("Enter aclgrphBuildInitialize process!");
+  std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
+  if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
+    GELOGI("aclgrphBuildInitialize start!");
+    auto ret = ge::GELib::Initialize(global_options);
+    if (ret != ge::SUCCESS) {
+      GELOGE(ret, "GE initialize failed!");
+      return GRAPH_FAILED;
+    }
+  } else {
+    GELOGW("gelib has already been initialized!");
+  }
+  return GRAPH_SUCCESS;
+}
+
+void aclgrphBuildFinalize() {
+  if (ge::GELib::GetInstance() != nullptr && ge::GELib::GetInstance()->InitFlag()) {
+    (void)ge::GELib::GetInstance()->Finalize();
+    return;
+  }
+  GELOGW("[Notice] gelib has not been initialized! Do nothing.");
+}
+
+class Impl {
+ public:
+  Impl() {
+    GetContext().format = domi::DOMI_TENSOR_ND;
+    GetContext().input_nodes_format_map.clear();
+    GetContext().output_formats.clear();
+    GetContext().user_input_dims.clear();
+    GetContext().input_dims.clear();
+    GetContext().op_conf_map.clear();
+    GetContext().out_nodes_map.clear();
+    GetContext().user_out_nodes.clear();
+    GetContext().net_format = domi::DOMI_TENSOR_RESERVED;
+    GetContext().type = domi::FRAMEWORK_RESERVED;
+    GetContext().run_mode = ONLY_PRE_CHECK;
+    GetContext().train_flag = false;
+    GetContext().fp16_high_precision = HIGH_PRECISION_DEFAULT;
+    GetContext().output_type.clear();
+    GetContext().net_name.clear();
+    GetContext().is_dynamic_input = false;
+    GetContext().dynamic_batch_size.clear();
+    GetContext().dynamic_image_size.clear();
+  };
+  ~Impl() { (void)generator_.Finalize(); };
+  graphStatus CheckOptions(const std::map<std::string, std::string> &options);
+  graphStatus CreateInputsForIRBuild(const ge::Graph &graph, vector<ge::GeTensor> &inputs);
+  graphStatus Init(const std::map<std::string, std::string> &options);
+  graphStatus BuildModel(const Graph &graph, const std::map<std::string, std::string> &options,
+                         ModelBufferData &ge_models);
+  graphStatus InitDomiOmgContext(const string &input_shape, const string &input_format, const string &net_format,
+                                 bool is_dynamic_input);
+
+ public:
+  ge::GeGenerator generator_;
+  std::map<std::string, std::string> options_;
+  bool is_dynamic_input_ = false;
+};
+
+graphStatus Impl::CheckOptions(const std::map<std::string, std::string> &options) {
+  for (auto &ele : options) {
+    auto it = ge::ir_option::ir_builder_suppported_options.find(ele.first);
+    if (it == ge::ir_option::ir_builder_suppported_options.end()) {
+      GELOGE(GRAPH_PARAM_INVALID, "input options include unsupported option (%s). Please check!", ele.first.c_str());
+      return GRAPH_PARAM_INVALID;
+    }
+    options_.insert(ele);
+  }
+  return GRAPH_SUCCESS;
+}
+graphStatus Impl::Init(const std::map<std::string, std::string> &options) {
+  // 1. check options
+  graphStatus ret = CheckOptions(options);
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(ret, "user input options are illegal! Please check!");
+    return ret;
+  }
+
+  auto iter = options_.find(ge::ir_option::OP_NAME_MAP);
+  if (iter != options_.end()) {
+    // divided by ":"
+    PropertiesManager::Instance().SetPropertyDelimiter(IR_OP_CONF_DELIMITER);
+    // Parsing the op_conf configuration item file
+    GE_RETURN_WITH_LOG_IF_FALSE(PropertiesManager::Instance().Init(iter->second), "op_name_map init failed!");
+    // Return map and put it into ATC global variable
+    GetContext().op_conf_map.clear();
+    GetContext().op_conf_map = PropertiesManager::Instance().GetPropertyMap();
+  }
+
+  string input_shape = options_.find("input_shape") == options_.end() ? "" : options_["input_shape"];
+  string input_format = options_.find("input_format") == options_.end() ? "" : options_["input_format"];
+  string net_format = options_.find("net_format") == options_.end() ? "" : options_["net_format"];
+  string dynamic_batch_size = options_.find(ge::ir_option::DYNAMIC_BATCH_SIZE) == options_.end()
+                                ? ""
+                                : options_[ge::ir_option::DYNAMIC_BATCH_SIZE];
+  string dynamic_image_size = options_.find(ge::ir_option::DYNAMIC_IMAGE_SIZE) == options_.end()
+                                ? ""
+                                : options_[ge::ir_option::DYNAMIC_IMAGE_SIZE];
+
+  auto status = CheckDynamicBatchSizeOrImageSizeParamValid(dynamic_batch_size, dynamic_image_size, input_shape,
+                                                           input_format, is_dynamic_input_);
+  if (status != ge::SUCCESS) {
+    GELOGE(GRAPH_PARAM_INVALID, "check dynamic batch size or image size failed!");
+    return GRAPH_PARAM_INVALID;
+  }
+  GELOGD("user input dynamic_batch_size:%s dynamic_image_size:%s", dynamic_batch_size.c_str(),
+         dynamic_image_size.c_str());
+  GetContext().dynamic_batch_size = dynamic_batch_size;
+  GetContext().dynamic_image_size = dynamic_image_size;
+
+  // for IR builder. Only om mode is supported, so it is fixed here;
+  options_.insert(std::pair<string, string>(string(IR_OPTION_MODE), to_string(0)));
+  options_.insert(std::pair<string, string>(string(IR_OPTION_TARGET), "mini"));
+  options_.insert(std::pair<string, string>(string(ge::RUN_FLAG), to_string(0)));
+  options_.insert(std::pair<string, string>(string(ge::TRAIN_FLAG), to_string(0)));
+  options_.insert(std::pair<string, string>(string(ge::SAVE_ORIGINAL_MODEL), to_string(0)));
+
+  // 3. init generator with options_
+  ret = generator_.Initialize(options_);
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(ret, "generator Initialize failed!");
+    return ret;
+  }
+  // 4. parse and init Context with input shape, input format and net format info
+  return this->InitDomiOmgContext(input_shape, input_format, net_format, is_dynamic_input_);
+}
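[Editor's note] For reference, a minimal sketch of the option map Impl::Init consumes. The keys match the lookups above ("input_shape", "input_format", and the ge::ir_option constants, whose string values are assumed here to spell "dynamic_batch_size"); the concrete values are only illustrative, not mandated by this patch:

  // Illustrative build options for Impl::Init (values are examples only).
  std::map<std::string, std::string> build_options = {
      {"input_shape", "data:1,3,224,224"},  // consumed by InitDomiOmgContext via ParseInputShape
      {"input_format", "NCHW"},             // must be a key of input_format_str_to_geformat
      {"dynamic_batch_size", "1,2,4,8"},    // checked by CheckDynamicBatchSizeOrImageSizeParamValid
  };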
+graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vector<ge::GeTensor> &inputs) {
+  auto compute_graph = ge::GraphUtils::GetComputeGraph(graph);
+  GE_CHECK_NOTNULL(compute_graph);
+  for (ge::NodePtr &input_node : compute_graph->GetDirectNode()) {
+    GE_CHECK_NOTNULL(input_node);
+    ge::OpDescPtr op = input_node->GetOpDesc();
+    GE_CHECK_NOTNULL(op);
+    if (op->GetType() == DATA) {
+      GELOGI("Data op inputDesc size is: %zu", op->GetAllInputsDesc().size());
+      ge::GeTensorDesc tensor = op->GetInputDesc(0);
+      string data_op_name = op->GetName();
+      GELOGI("Data op name is: %s", data_op_name.c_str());
+      ge::GeShape data_shape;
+      auto iter = GetContext().input_dims.find(data_op_name);
+      if (iter != GetContext().input_dims.end()) {
+        data_shape = ge::GeShape(iter->second);
+        GELOGI("Data op get shape from Context.");
+      } else {
+        data_shape = tensor.GetShape();
+        GELOGI("Data op get shape from InputDesc in ge ir graph.");
+      }
+
+      ge::DataType data_type = tensor.GetDataType();
+      string data_type_str = ge::TypeUtils::DataTypeToSerialString(data_type);
+      GELOGI("Data op get data type:%s from InputDesc in ge ir graph.", data_type_str.c_str());
+
+      ge::GeTensor inputTensor;
+      ge::GeTensorDesc desc(data_shape, ge::Format(GetContext().format), data_type);
+      inputTensor.SetTensorDesc(desc);
+      inputs.push_back(inputTensor);
+    }
+  }
+  GELOGD("CreateInputsForIRBuild, inputs size is: %zu", inputs.size());
+  return GRAPH_SUCCESS;
+}
+graphStatus Impl::BuildModel(const Graph &graph, const std::map<std::string, std::string> &options,
+                             ModelBufferData &model) {
+  // 1. init GeGenerator with user options
+  graphStatus ret = Init(options);
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(ret, "Build IR model Init Failed!");
+    return ret;
+  }
+
+  // 2. construct input
+  std::vector<ge::GeTensor> inputs;
+  if (!GetContext().is_dynamic_input) {  // if dynamic input, no need to create inputs
+    ret = CreateInputsForIRBuild(graph, inputs);
+    if (ret != GRAPH_SUCCESS) {
+      GELOGE(ret, "CreateInputsForIRBuild failed!");
+      return ret;
+    }
+  }
+
+  // 3. build IR model
+  ret = generator_.GenerateOnlineModel(graph, inputs, model);
+
+  if (ret != GRAPH_SUCCESS) {
+    GELOGE(ret, "GenerateOnlineModel failed!");
+    return ret;
+  }
+
+  return GRAPH_SUCCESS;
+}
+graphStatus Impl::InitDomiOmgContext(const string &input_shape, const string &input_format, const string &net_format,
+                                     bool is_dynamic_input) {
+  // Clear omgcontext data first
+  GetContext().input_dims.clear();
+  GetContext().user_input_dims.clear();
+  GetContext().is_dynamic_input = is_dynamic_input;
+  // the default value is ND
+  GetContext().format = domi::DOMI_TENSOR_ND;
+  if (!input_format.empty()) {
+    auto iter = input_format_str_to_geformat.find(input_format);
+    if (iter != input_format_str_to_geformat.end()) {
+      GetContext().format = iter->second;
+    } else {
+      GELOGE(GRAPH_PARAM_INVALID, "Input format %s is not supported, expect ND/NCHW/NHWC/CHWN/NC1HWC0/NHWC1C0.",
+             input_format.c_str());
+      return GRAPH_PARAM_INVALID;
+    }
+  }
+  // Input is empty, do not process
+  if (input_shape.empty()) {
+    return GRAPH_SUCCESS;
+  }
+
+  if (!ParseInputShape(input_shape, GetContext().input_dims, GetContext().user_input_dims, is_dynamic_input)) {
+    GELOGE(GRAPH_PARAM_INVALID, "Failed to parse input shape: %s", input_shape.c_str());
+    return GRAPH_PARAM_INVALID;
+  }
+  return GRAPH_SUCCESS;
+}
+
+graphStatus aclgrphBuildModel(const ge::Graph &graph, const std::map<std::string, std::string> &build_options,
+                              ModelBufferData &model) {
+  GELOGD("Enter aclgrphBuildModel process!");
+  Impl builder;
+  return builder.BuildModel(graph, build_options, model);
+}
+
+graphStatus aclgrphSaveModel(const string &output_file, const ModelBufferData &model) {
+  GELOGD("Enter aclgrphSaveModel process!");
+  if (model.data.get() == nullptr || model.length == 0) {
+    GELOGE(GRAPH_PARAM_INVALID, "input model is illegal");
+    return GRAPH_PARAM_INVALID;
+  }
+  return FileSaver::SaveToFile((output_file + ".om"), (void *)model.data.get(), static_cast<uint32_t>(model.length));
+}
+}  // namespace ge
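[Editor's note] The new ge_ir_build.cc above is the offline IR-build entry point. A minimal caller sketch under the API added here; the graph construction, option contents, and output file name are illustrative, not part of the patch:

  #include <map>
  #include "external/ge/ge_ir_build.h"

  ge::graphStatus BuildAndSave(const ge::Graph &graph) {
    std::map<std::string, std::string> global_options;  // global GE options; contents illustrative
    if (ge::aclgrphBuildInitialize(global_options) != ge::GRAPH_SUCCESS) {
      return ge::GRAPH_FAILED;
    }
    std::map<std::string, std::string> build_options;  // must pass the Impl::CheckOptions whitelist
    ge::ModelBufferData model;
    ge::graphStatus ret = ge::aclgrphBuildModel(graph, build_options, model);
    if (ret == ge::GRAPH_SUCCESS) {
      ret = ge::aclgrphSaveModel("resnet50", model);  // aclgrphSaveModel appends ".om"
    }
    ge::aclgrphBuildFinalize();
    return ret;
  }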
diff --git a/src/ge/model/ge_model.cc b/src/ge/model/ge_model.cc
index 36b564b8..348f8416 100644
--- a/src/ge/model/ge_model.cc
+++ b/src/ge/model/ge_model.cc
@@ -25,6 +25,7 @@ void GeModel::Init() {
   (void)AttrUtils::SetInt(this, ATTR_MODEL_MEMORY_SIZE, 0);
   (void)AttrUtils::SetInt(this, ATTR_MODEL_STREAM_NUM, 0);
   (void)AttrUtils::SetInt(this, ATTR_MODEL_EVENT_NUM, 0);
+  (void)AttrUtils::SetInt(this, ATTR_MODEL_LABEL_NUM, 0);
   (void)AttrUtils::SetInt(this, ATTR_MODEL_WEIGHT_SIZE, 0);
   (void)AttrUtils::SetStr(this, ATTR_MODEL_TARGET_TYPE, TARGET_TYPE_MINI);
   version_ = 0;
diff --git a/src/ge/omm/csa_interact.cc b/src/ge/omm/csa_interact.cc
index 4b50f966..dd3f6240 100644
--- a/src/ge/omm/csa_interact.cc
+++ b/src/ge/omm/csa_interact.cc
@@ -30,10 +30,7 @@ namespace {
 const char FMK_STATUS_FILE_DIR_ENV[] = "FMK_STATUS_FILE_DIR";
 const char JOBSTATE_FILE_NAME[] = "jobstateupdate_framework";
 const char HCOM_DETECT_FILE_NAME[] = "hcom_detection_result";
-
 const char FILE_SEPARATE[] = "/";
-
-const uint32_t CSA_DIR_RWX_RIGHT = 0750;
 }  // namespace
 
 ///
diff --git a/src/ge/opskernel_manager/ops_kernel_manager.cc b/src/ge/opskernel_manager/ops_kernel_manager.cc
index 5c18b213..0785ad81 100644
--- a/src/ge/opskernel_manager/ops_kernel_manager.cc
+++ b/src/ge/opskernel_manager/ops_kernel_manager.cc
@@ -341,7 +341,7 @@ const map &OpsKernelManager::GetAllOpsKernelInfoS
 const map<string, GraphOptimizerPtr> &OpsKernelManager::GetAllGraphOptimizerObjs() const { return graph_optimizers_; }
 
 void OpsKernelManager::GetGraphOptimizerByEngine(const std::string &engine_name,
-                                                 vector<GraphOptimizerPtr> &graph_optimizer) const {
+                                                 vector<GraphOptimizerPtr> &graph_optimizer) {
   for (const auto &it : graph_optimizers_) {
     GraphOptimizerAttribute attrs;
     if (it.second->GetAttributes(attrs) != SUCCESS) {
diff --git a/src/ge/opskernel_manager/ops_kernel_manager.h b/src/ge/opskernel_manager/ops_kernel_manager.h
index f779d2ec..d83b7bc4 100644
--- a/src/ge/opskernel_manager/ops_kernel_manager.h
+++ b/src/ge/opskernel_manager/ops_kernel_manager.h
@@ -32,8 +32,8 @@
 #include "ge/ge_api_types.h"
 #include "runtime/base.h"
 
-using std::string;
 using std::map;
+using std::string;
 using std::vector;
 
 namespace ge {
@@ -59,7 +59,7 @@ class OpsKernelManager {
   const map<string, GraphOptimizerPtr> &GetAllGraphOptimizerObjs() const;
 
   // get subgraphOptimizer by engine name
-  void GetGraphOptimizerByEngine(const std::string &engine_name, vector<GraphOptimizerPtr> &graph_optimizer) const;
+  void GetGraphOptimizerByEngine(const std::string &engine_name, vector<GraphOptimizerPtr> &graph_optimizer);
 
   // get enableFeFlag
   bool GetEnableFeFlag() const;
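[Editor's note] For context, the signature change above (dropping const) is reached through the GELib singleton; a minimal lookup sketch, with the engine name purely illustrative and GraphOptimizerPtr as restored in the declarations above:

  std::shared_ptr<ge::GELib> gelib = ge::GELib::GetInstance();
  if (gelib != nullptr && gelib->InitFlag()) {
    std::vector<ge::GraphOptimizerPtr> optimizers;
    // fills 'optimizers' with the optimizers whose attributes name this engine
    gelib->OpsKernelManagerObj().GetGraphOptimizerByEngine("AIcoreEngine", optimizers);
  }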
diff --git a/src/ge/session/session_manager.cc b/src/ge/session/session_manager.cc
index 0c9685df..ebe0b188 100644
--- a/src/ge/session/session_manager.cc
+++ b/src/ge/session/session_manager.cc
@@ -67,21 +67,21 @@ Status SessionManager::CreateSession(const std::map &o
   SessionId next_session_id = 0;
 
   std::lock_guard<std::mutex> lock(mutex_);
-  Status next_session_id_ret = GetNextSessionId(next_session_id);
-  if (next_session_id_ret != SUCCESS) {
-    return next_session_id_ret;
+  Status nextSessionIdRet = GetNextSessionId(next_session_id);
+  if (nextSessionIdRet != SUCCESS) {
+    return nextSessionIdRet;
   }
 
-  SessionPtr session_ptr = MakeShared<InnerSession>(next_session_id, options);
-  if (session_ptr == nullptr) {
+  SessionPtr sessionPtr = MakeShared<InnerSession>(next_session_id, options);
+  if (sessionPtr == nullptr) {
     return MEMALLOC_FAILED;
   }
-  Status ret = session_ptr->Initialize();
+  Status ret = sessionPtr->Initialize();
   if (ret != SUCCESS) {
     return ret;
   }
-  (void)session_manager_map_.emplace(std::pair<SessionId, SessionPtr>(next_session_id, session_ptr));
+  (void)session_manager_map_.emplace(std::pair<SessionId, SessionPtr>(next_session_id, sessionPtr));
   session_id = next_session_id;
 
   // create a context
@@ -108,8 +108,8 @@ Status SessionManager::DestroySession(SessionId session_id) {
   // Unified destruct rt_context
   RtContextUtil::GetInstance().DestroyrtContexts();
 
-  SessionPtr inner_session = it->second;
-  Status ret = inner_session->Finalize();
+  SessionPtr innerSession = it->second;
+  Status ret = innerSession->Finalize();
   if (ret != SUCCESS) {
     return ret;
   }
@@ -122,17 +122,17 @@ Status SessionManager::GetVariable(SessionId session_id, const std::string &name
     GELOGE(GE_SESSION_MANAGER_NOT_INIT);
     return GE_SESSION_MANAGER_NOT_INIT;
   }
-  SessionPtr inner_session = nullptr;
+  SessionPtr innerSession = nullptr;
   {
     std::lock_guard<std::mutex> lock(mutex_);
     std::map<SessionId, SessionPtr>::iterator it = session_manager_map_.find(session_id);
     if (it == session_manager_map_.end()) {
       return GE_SESSION_NOT_EXIST;
     } else {
-      inner_session = it->second;
+      innerSession = it->second;
     }
   }
-  return inner_session->GetVariable(name, val);
+  return innerSession->GetVariable(name, val);
 }
 
 Status SessionManager::AddGraph(SessionId session_id, uint32_t graph_id, const Graph &graph) {
@@ -146,14 +146,14 @@ Status SessionManager::AddGraph(SessionId session_id, uint32_t graph_id, const G
     GELOGE(GE_SESSION_MANAGER_NOT_INIT);
     return GE_SESSION_MANAGER_NOT_INIT;
   }
-  SessionPtr inner_session = nullptr;
+  SessionPtr innerSession = nullptr;
   {
     std::lock_guard<std::mutex> lock(mutex_);
     std::map<SessionId, SessionPtr>::iterator it = session_manager_map_.find(session_id);
     if (it == session_manager_map_.end()) {
       return GE_SESSION_NOT_EXIST;
     } else {
-      inner_session = it->second;
+      innerSession = it->second;
     }
     auto compute_graph = GraphUtils::GetComputeGraph(graph);
     std::string session_graph_id = std::to_string(session_id) + "_" + std::to_string(graph_id);
@@ -163,7 +163,7 @@ Status SessionManager::AddGraph(SessionId session_id, uint32_t graph_id, const G
       GELOGD("Set graph session_graph_id attr to [%s]", session_graph_id.c_str());
     }
   }
-  return inner_session->AddGraph(graph_id, graph);
+  return innerSession->AddGraph(graph_id, graph, options);
 }
 
 Status SessionManager::RunGraph(SessionId session_id, uint32_t graph_id, const std::vector<Tensor> &inputs,
@@ -172,17 +172,17 @@ Status SessionManager::RunGraph(SessionId session_id, uint32_t graph_id, const s
     GELOGE(GE_SESSION_MANAGER_NOT_INIT);
     return GE_SESSION_MANAGER_NOT_INIT;
   }
-  SessionPtr inner_session = nullptr;
+  SessionPtr innerSession = nullptr;
   {
     std::lock_guard<std::mutex> lock(mutex_);
     std::map<SessionId, SessionPtr>::iterator it = session_manager_map_.find(session_id);
     if (it == session_manager_map_.end()) {
       return GE_SESSION_NOT_EXIST;
     } else {
-      inner_session = it->second;
+      innerSession = it->second;
     }
   }
-  return inner_session->RunGraph(graph_id, inputs, outputs);
+  return innerSession->RunGraph(graph_id, inputs, outputs);
 }
 
 Status SessionManager::RemoveGraph(SessionId session_id, uint32_t graph_id) {
@@ -190,17 +190,17 @@ Status SessionManager::RemoveGraph(SessionId session_id, uint32_t graph_id) {
     GELOGE(GE_SESSION_MANAGER_NOT_INIT);
     return GE_SESSION_MANAGER_NOT_INIT;
   }
-  SessionPtr inner_session = nullptr;
+  SessionPtr innerSession = nullptr;
   {
     std::lock_guard<std::mutex> lock(mutex_);
     std::map<SessionId, SessionPtr>::iterator it = session_manager_map_.find(session_id);
     if (it == session_manager_map_.end()) {
       return GE_SESSION_NOT_EXIST;
     } else {
-      inner_session = it->second;
+      innerSession = it->second;
     }
   }
-  return inner_session->RemoveGraph(graph_id);
+  return innerSession->RemoveGraph(graph_id);
 }
 
 bool SessionManager::HasSession(SessionId session_id) {
@@ -229,17 +229,17 @@ Status SessionManager::RegisterCallBackFunc(
     GELOGE(GE_SESSION_MANAGER_NOT_INIT);
     return GE_SESSION_MANAGER_NOT_INIT;
   }
-  SessionPtr inner_session = nullptr;
+  SessionPtr innerSession = nullptr;
   {
     std::lock_guard<std::mutex> lock(mutex_);
     std::map<SessionId, SessionPtr>::iterator it = session_manager_map_.find(session_id);
     if (it == session_manager_map_.end()) {
       return GE_SESSION_NOT_EXIST;
     } else {
-      inner_session = it->second;
+      innerSession = it->second;
     }
   }
-  return inner_session->RegisterCallBackFunc(key, callback);
+  return innerSession->RegisterCallBackFunc(key, callback);
 }
 
 Status SessionManager::RunGraphAsync(SessionId session_id, uint32_t graph_id, const std::vector<Tensor> &inputs,
@@ -248,24 +248,24 @@ Status SessionManager::RunGraphAsync(SessionId session_id, uint32_t graph_id, co
     GELOGE(GE_SESSION_MANAGER_NOT_INIT);
     return GE_SESSION_MANAGER_NOT_INIT;
   }
-  SessionPtr inner_session = nullptr;
+  SessionPtr innerSession = nullptr;
   {
     std::lock_guard<std::mutex> lock(mutex_);
     std::map<SessionId, SessionPtr>::iterator it = session_manager_map_.find(session_id);
     if (it == session_manager_map_.end()) {
       return GE_SESSION_NOT_EXIST;
    } else {
-      inner_session = it->second;
+      innerSession = it->second;
    }
  }
-  return inner_session->RunGraphAsync(graph_id, inputs, outputs, callback);
+  return innerSession->RunGraphAsync(graph_id, inputs, outputs, callback);
 }
 
 bool SessionManager::IsGraphNeedRebuild(SessionId session_id, uint32_t graph_id) {
   if (!init_flag_) {
     GELOGE(GE_SESSION_MANAGER_NOT_INIT);
     return true;
   }
-  SessionPtr inner_session = nullptr;
+  SessionPtr innerSession = nullptr;
   {
     std::lock_guard<std::mutex> lock(mutex_);
     auto it = session_manager_map_.find(session_id);
@@
-273,9 +273,9 @@ bool SessionManager::IsGraphNeedRebuild(SessionId session_id, uint32_t graph_id) GELOGE(GE_SESSION_NOT_EXIST, "The session %lu does not exists", session_id); return true; } else { - inner_session = it->second; + innerSession = it->second; } } - return inner_session->IsGraphNeedRebuild(graph_id); + return innerSession->IsGraphNeedRebuild(graph_id); } }; // namespace ge diff --git a/src/ge/session/session_manager.h b/src/ge/session/session_manager.h index 3fd8cf6f..10ff3edf 100644 --- a/src/ge/session/session_manager.h +++ b/src/ge/session/session_manager.h @@ -33,7 +33,7 @@ class SessionManager { friend class GELib; public: - Status SetrtContext(rtContext_t rt_context); + Status SetrtContext(rtContext_t rtContext); /// /// @ingroup ge_session /// @brief create session diff --git a/src/ge/single_op/single_op.cc b/src/ge/single_op/single_op.cc index 59a17d38..475e463f 100644 --- a/src/ge/single_op/single_op.cc +++ b/src/ge/single_op/single_op.cc @@ -143,15 +143,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c return ret; } } - if (ProfilingManager::Instance().ProfilingOpTraceOn()) { - GELOGI("Op trace on, iter num:%d", ProfilingManager::Instance().GetOpTraceIterNum()); - ret = rtStreamSynchronize(stream_); - if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Invoke rtStreamSynchronize failed."); - return ret; - } - ProfilingManager::Instance().StopProfiling(); - } return ret; } diff --git a/src/ge/single_op/single_op_model.cc b/src/ge/single_op/single_op_model.cc index 51a77694..1cede863 100644 --- a/src/ge/single_op/single_op_model.cc +++ b/src/ge/single_op/single_op_model.cc @@ -29,9 +29,9 @@ #include "runtime/rt.h" #include "task/tbe_task_builder.h" -using std::vector; -using std::unique_ptr; using domi::TaskDef; +using std::unique_ptr; +using std::vector; namespace ge { namespace { @@ -77,13 +77,11 @@ void SingleOpModel::ParseOpModelParams(ModelHelper &model_helper, SingleOpModelP ret = ge::AttrUtils::GetInt(model, MODEL_ATTR_TASK_GEN_WEIGHT_ADDR, value); param.weight_addr = ret ? static_cast(value) : 0; - GELOGI("ParseOpModelParams(), memory_size:%lu, weight_size:%lu.", param.memory_size, - param.weight_size); + GELOGI("ParseOpModelParams(), memory_size:%lu, weight_size:%lu.", param.memory_size, param.weight_size); } Status SingleOpModel::InitModelMem(StreamResource &res) { ParseOpModelParams(model_helper_, model_params_); - if (model_params_.memory_size > ALLOC_MEMORY_MAX_SIZE || model_params_.weight_size > ALLOC_MEMORY_MAX_SIZE) { GELOGE(PARAM_INVALID, "Can not alloc memory larger than %lu. 
memory size = %lu, weight size = %lu", ALLOC_MEMORY_MAX_SIZE, model_params_.memory_size, model_params_.weight_size); @@ -117,11 +115,11 @@ Status SingleOpModel::ParseInputNode(const OpDescPtr &op_desc) { auto output_desc = op_desc->GetOutputDescPtr(0); GE_CHECK_NOTNULL(output_desc); - uint32_t tensor_size = 0; + int64_t tensor_size = 0; (void)TensorUtils::GetSize(*output_desc, tensor_size); input_offset_list_.emplace_back(offsets[0]); input_sizes_.emplace_back(tensor_size); - GELOGI("[%s] parse input node: %s, size = %u, offset = %u", model_name_.c_str(), op_desc->GetName().c_str(), + GELOGI("[%s] parse input node: %s, size = %ld, offset = %u", model_name_.c_str(), op_desc->GetName().c_str(), tensor_size, static_cast(offsets[0])); return SUCCESS; } @@ -133,11 +131,11 @@ void SingleOpModel::ParseOutputNode(const OpDescPtr &op_desc) { if (input_desc == nullptr) { continue; } - uint32_t tensor_size = 0; + int64_t tensor_size = 0; (void)TensorUtils::GetSize(*input_desc, tensor_size); output_offset_list_.emplace_back(offsets[k]); output_sizes_.emplace_back(tensor_size); - GELOGI("[%s] parse output node: %s, size = %u, offset = %u", model_name_.c_str(), op_desc->GetName().c_str(), + GELOGI("[%s] parse output node: %s, size = %ld, offset = %u", model_name_.c_str(), op_desc->GetName().c_str(), tensor_size, static_cast(offsets[k])); } } @@ -152,7 +150,7 @@ Status SingleOpModel::ParseInputsAndOutputs() { return PARAM_INVALID; } - auto nodes = compute_graph->GetAllNodes(); + auto nodes = compute_graph->GetDirectNode(); size_t model_op_size = nodes.size(); GELOGI("[%s] node size = %zu", model_name_.c_str(), model_op_size); diff --git a/src/ge/single_op/stream_resource.cc b/src/ge/single_op/stream_resource.cc index 0ba51fe3..53dfb183 100644 --- a/src/ge/single_op/stream_resource.cc +++ b/src/ge/single_op/stream_resource.cc @@ -44,9 +44,7 @@ StreamResource::~StreamResource() { } } -void StreamResource::CacheOperator(const void *key, SingleOp *single_op) { - op_map_[key] = single_op; -} +void StreamResource::CacheOperator(const void *key, SingleOp *single_op) { op_map_[key] = single_op; } SingleOp *StreamResource::GetOperator(const void *key) { auto it = op_map_.find(key); @@ -69,6 +67,7 @@ uint8_t *StreamResource::DoMallocMemory(size_t size, size_t &max_allocated, std: GELOGE(RT_FAILED, "rtMalloc failed, size = %zu, ret = %d", size, ret); return nullptr; } + GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "malloc function.", size) ret = rtMemset(buffer, size, 0U, size); if (ret != RT_ERROR_NONE) { diff --git a/src/ge/single_op/task/tbe_task_builder.cc b/src/ge/single_op/task/tbe_task_builder.cc index c2c56398..b8911d0c 100644 --- a/src/ge/single_op/task/tbe_task_builder.cc +++ b/src/ge/single_op/task/tbe_task_builder.cc @@ -22,8 +22,8 @@ #include "common/helper/model_helper.h" #include "framework/common/debug/ge_log.h" -#include "graph/debug/ge_attr_define.h" #include "graph/load/new_model_manager/model_utils.h" +#include "graph/debug/ge_attr_define.h" #include "graph/load/new_model_manager/task_info/task_info.h" #include "graph/manager/graph_var_manager.h" #include "runtime/rt.h" @@ -201,8 +201,8 @@ Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task) { return INTERNAL_ERROR; } } else { - delete holder; - holder = nullptr; + delete holder; + holder = nullptr; } } diff --git a/src/proto/op_mapping_info.proto b/src/proto/op_mapping_info.proto index 2bf26f7a..ea4c4a8d 100644 --- a/src/proto/op_mapping_info.proto +++ b/src/proto/op_mapping_info.proto @@ -42,6 +42,7 @@ message Task { uint32 stream_id = 2; Op op = 3; 
repeated Output output = 4; + bool end_graph = 5; }; message OpMappingInfo { @@ -63,4 +64,5 @@ message OpMappingInfo { } uint32 flag = 7; // 0x01 load, 0x00 unload repeated Task task = 8; + string dump_step = 9; }; \ No newline at end of file diff --git a/tests/depends/cce/CMakeLists.txt b/tests/depends/cce/CMakeLists.txt index 21ff8231..70516146 100644 --- a/tests/depends/cce/CMakeLists.txt +++ b/tests/depends/cce/CMakeLists.txt @@ -28,6 +28,7 @@ include_directories(${GE_SOURCE_DIR}/src/common) include_directories(${GE_SOURCE_DIR}/src/common/graph) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/cce) +include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/ops) include_directories(${GE_SOURCE_DIR}/third_party/securec/include) include_directories(${CMAKE_BINARY_DIR}) include_directories(${CMAKE_BINARY_DIR}/proto/ge) diff --git a/tests/depends/cce/src/op_kernel_registry.cc b/tests/depends/cce/src/op_kernel_registry.cc new file mode 100644 index 00000000..5ccd1391 --- /dev/null +++ b/tests/depends/cce/src/op_kernel_registry.cc @@ -0,0 +1,29 @@ +#include "register/op_kernel_registry.h" + +namespace ge { +class OpKernelRegistry::OpKernelRegistryImpl { + +}; + +OpKernelRegistry::OpKernelRegistry() { +} + +OpKernelRegistry::~OpKernelRegistry() { + +} + +bool OpKernelRegistry::IsRegistered(const std::string &op_type) { + return false; +} + +std::unique_ptr OpKernelRegistry::CreateHostCpuOp(const std::string &op_type) { + return nullptr; +} + +void OpKernelRegistry::RegisterHostCpuOp(const std::string &op_type, CreateFn create_fn) { +} + +HostCpuOpRegistrar::HostCpuOpRegistrar(const char *op_type, HostCpuOp *(*create_fn)()) { + +} +} // namespace ge \ No newline at end of file diff --git a/tests/depends/mmpa/src/mmpa_stub.cc b/tests/depends/mmpa/src/mmpa_stub.cc index bdf24326..76d32cad 100644 --- a/tests/depends/mmpa/src/mmpa_stub.cc +++ b/tests/depends/mmpa/src/mmpa_stub.cc @@ -208,3 +208,12 @@ INT32 mmGetFileSize(const CHAR *file_name, ULONGLONG *length) { *length = (ULONGLONG)file_stat.st_size; return EN_OK; } + +INT32 mmScandir(const CHAR *path, mmDirent ***entryList, mmFilter filterFunc, mmSort sort) +{ + return 0; +} + +VOID mmScandirFree(mmDirent **entryList, INT32 count) +{ +} diff --git a/tests/depends/runtime/src/runtime_stub.cc b/tests/depends/runtime/src/runtime_stub.cc index 5ab36af3..e444cc87 100644 --- a/tests/depends/runtime/src/runtime_stub.cc +++ b/tests/depends/runtime/src/runtime_stub.cc @@ -105,6 +105,10 @@ rtError_t rtMemcpyAsync(void *dst, uint64_t dest_max, const void *src, uint64_t rtError_t rtStreamWaitEvent(rtStream_t stream, rtEvent_t event) { return RT_ERROR_NONE; } +rtError_t rtSetTSDevice(uint32_t tsId) { + return RT_ERROR_NONE; +} + rtError_t rtGetDeviceCount(int32_t *count) { *count = 1; return RT_ERROR_NONE; @@ -201,7 +205,11 @@ rtError_t rtGetFunctionByName(const char *stub_name, void **stub_func) { *(char **)stub_func = "func"; return RT_ERROR_NONE; } - +rtError_t rtGetAddrByFun(const void *stubFunc, void **addr) +{ + *(char**)addr = "dev_func"; + return RT_ERROR_NONE; +} rtError_t rtQueryFunctionRegistered(const char *stub_name) { return RT_ERROR_NONE; } rtError_t rtCtxCreate(rtContext_t *ctx, uint32_t flags, int32_t device) { return RT_ERROR_NONE; } @@ -218,6 +226,10 @@ rtError_t rtModelGetTaskId(void *handle, uint32_t *task_id) { return RT_ERROR_NONE; } rtError_t rtEndGraph(rtModel_t model, rtStream_t stream) { return RT_ERROR_NONE; } +rtError_t rtEndGraphEx(rtModel_t 
model, rtStream_t stream, uint32_t flags) +{ + return RT_ERROR_NONE; +} rtError_t rtProfilerStop(void) { return RT_ERROR_NONE; } rtError_t rtSetDvfsProfile(DvfsProfileMode mode) { return RT_ERROR_NONE; } @@ -284,4 +296,14 @@ rtError_t rtKernelLaunchWithFlag(const void *stub_func, uint32_t block_dim, void rtError_t rtCpuKernelLaunchWithFlag(const void *so_name, const void *kernel_name, uint32_t core_dim, const void *args, uint32_t args_size, rtL2Ctrl_t *l2ctrl, rtStream_t stream_, uint32_t flags) { return RT_ERROR_NONE; -} \ No newline at end of file +} + +rtError_t rtModelGetId(rtModel_t model, uint32_t *modelId) +{ + return RT_ERROR_NONE; +} + +rtError_t rtModelBindQueue(rtModel_t model, uint32_t queueId, rtModelQueueFlag_t flag) +{ + return RT_ERROR_NONE; +} diff --git a/tests/depends/slog/src/slog_stub.cc b/tests/depends/slog/src/slog_stub.cc index a27deba1..76b5324b 100644 --- a/tests/depends/slog/src/slog_stub.cc +++ b/tests/depends/slog/src/slog_stub.cc @@ -39,3 +39,8 @@ void DlogWithKVInner(int module_id, int level, KeyValue *pst_kv_array, int kv_nu } int dlog_getlevel(int module_id, int *enable_event) { return DLOG_DEBUG; } + +int CheckLogLevel(int moduleId, int logLevel) +{ + return 1; +} diff --git a/tests/st/resnet50/common.cc b/tests/st/resnet50/common.cc index c1d54205..674ef926 100755 --- a/tests/st/resnet50/common.cc +++ b/tests/st/resnet50/common.cc @@ -506,23 +506,26 @@ bool build_multi_input_multi_output_graph(Graph &graph) { auto relu2 = op::Relu("Relu2").set_input_x(data2); auto eltwise = op::Eltwise("Eltwise") - .create_dynamic_input___input(2) - .set_dynamic_input___input(0, relu1) - .set_dynamic_input___input(1, relu2) + .create_dynamic_input_x(2) + .set_dynamic_input_x(0, relu1) + .set_dynamic_input_x(1, relu2) + .set_attr_N(2) .set_attr_mode(1) .set_attr_coeff({1, 1}); auto eltwise1 = op::Eltwise("Eltwise1") - .create_dynamic_input___input(2) - .set_dynamic_input___input(0, eltwise) - .set_dynamic_input___input(1, eltwise) + .create_dynamic_input_x(2) + .set_dynamic_input_x(0, eltwise) + .set_dynamic_input_x(1, eltwise) + .set_attr_N(2) .set_attr_mode(1) .set_attr_coeff({1, 1}); auto eltwise2 = op::Eltwise("Eltwise2") - .create_dynamic_input___input(2) - .set_dynamic_input___input(0, eltwise) - .set_dynamic_input___input(1, eltwise) + .create_dynamic_input_x(2) + .set_dynamic_input_x(0, eltwise) + .set_dynamic_input_x(1, eltwise) + .set_attr_N(2) .set_attr_mode(1) .set_attr_coeff({1, 1}); @@ -538,137 +541,137 @@ void build_big_graph(Graph &graph, map> attr) { vector weight_shape(attr["weight"].begin(), attr["weight"].end()); TensorDesc weight_desc(ge::Shape(weight_shape), FORMAT_NCHW, DT_FLOAT); weight.update_output_desc_y(weight_desc); - auto conv_1 = op::Conv2D("conv1").set_input_x(data).set_input_filter(weight); - - auto conv_2 = op::Conv2D("conv2").set_input_x(conv_1).set_input_filter(weight); - auto conv_3 = op::Conv2D("conv3").set_input_x(conv_2).set_input_filter(weight); - auto conv_4 = op::Conv2D("conv4").set_input_x(conv_3).set_input_filter(weight); - auto conv_5 = op::Conv2D("conv5").set_input_x(conv_4).set_input_filter(weight); - auto conv_6 = op::Conv2D("conv6").set_input_x(conv_5).set_input_filter(weight); - auto conv_7 = op::Conv2D("conv7").set_input_x(conv_6).set_input_filter(weight); - auto conv_8 = op::Conv2D("conv8").set_input_x(conv_7).set_input_filter(weight); - auto conv_9 = op::Conv2D("conv9").set_input_x(conv_8).set_input_filter(weight); - auto conv_10 = op::Conv2D("conv10").set_input_x(conv_9).set_input_filter(weight); - auto conv_11 = 
op::Conv2D("conv11").set_input_x(conv_10).set_input_filter(weight); - auto conv_12 = op::Conv2D("conv12").set_input_x(conv_11).set_input_filter(weight); - auto conv_13 = op::Conv2D("conv13").set_input_x(conv_12).set_input_filter(weight); - auto conv_14 = op::Conv2D("conv14").set_input_x(conv_13).set_input_filter(weight); - auto conv_15 = op::Conv2D("conv15").set_input_x(conv_14).set_input_filter(weight); - auto conv_16 = op::Conv2D("conv16").set_input_x(conv_15).set_input_filter(weight); - auto conv_17 = op::Conv2D("conv17").set_input_x(conv_16).set_input_filter(weight); - auto conv_18 = op::Conv2D("conv18").set_input_x(conv_17).set_input_filter(weight); - auto conv_19 = op::Conv2D("conv19").set_input_x(conv_18).set_input_filter(weight); - auto conv_20 = op::Conv2D("conv20").set_input_x(conv_19).set_input_filter(weight); - auto conv_21 = op::Conv2D("conv21").set_input_x(conv_20).set_input_filter(weight); - auto conv_22 = op::Conv2D("conv22").set_input_x(conv_21).set_input_filter(weight); - auto conv_23 = op::Conv2D("conv23").set_input_x(conv_22).set_input_filter(weight); - auto conv_24 = op::Conv2D("conv24").set_input_x(conv_23).set_input_filter(weight); - auto conv_25 = op::Conv2D("conv25").set_input_x(conv_24).set_input_filter(weight); - auto conv_26 = op::Conv2D("conv26").set_input_x(conv_25).set_input_filter(weight); - auto conv_27 = op::Conv2D("conv27").set_input_x(conv_26).set_input_filter(weight); - auto conv_28 = op::Conv2D("conv28").set_input_x(conv_27).set_input_filter(weight); - auto conv_29 = op::Conv2D("conv29").set_input_x(conv_28).set_input_filter(weight); - auto conv_30 = op::Conv2D("conv30").set_input_x(conv_29).set_input_filter(weight); - auto conv_31 = op::Conv2D("conv31").set_input_x(conv_30).set_input_filter(weight); - auto conv_32 = op::Conv2D("conv32").set_input_x(conv_31).set_input_filter(weight); - auto conv_33 = op::Conv2D("conv33").set_input_x(conv_32).set_input_filter(weight); - auto conv_34 = op::Conv2D("conv34").set_input_x(conv_33).set_input_filter(weight); - auto conv_35 = op::Conv2D("conv35").set_input_x(conv_34).set_input_filter(weight); - auto conv_36 = op::Conv2D("conv36").set_input_x(conv_35).set_input_filter(weight); - auto conv_37 = op::Conv2D("conv37").set_input_x(conv_36).set_input_filter(weight); - auto conv_38 = op::Conv2D("conv38").set_input_x(conv_37).set_input_filter(weight); - auto conv_39 = op::Conv2D("conv39").set_input_x(conv_38).set_input_filter(weight); - auto conv_40 = op::Conv2D("conv40").set_input_x(conv_39).set_input_filter(weight); - auto conv_41 = op::Conv2D("conv41").set_input_x(conv_40).set_input_filter(weight); - auto conv_42 = op::Conv2D("conv42").set_input_x(conv_41).set_input_filter(weight); - auto conv_43 = op::Conv2D("conv43").set_input_x(conv_42).set_input_filter(weight); - auto conv_44 = op::Conv2D("conv44").set_input_x(conv_43).set_input_filter(weight); - auto conv_45 = op::Conv2D("conv45").set_input_x(conv_44).set_input_filter(weight); - auto conv_46 = op::Conv2D("conv46").set_input_x(conv_45).set_input_filter(weight); - auto conv_47 = op::Conv2D("conv47").set_input_x(conv_46).set_input_filter(weight); - auto conv_48 = op::Conv2D("conv48").set_input_x(conv_47).set_input_filter(weight); - auto conv_49 = op::Conv2D("conv49").set_input_x(conv_48).set_input_filter(weight); - auto conv_50 = op::Conv2D("conv50").set_input_x(conv_49).set_input_filter(weight); - auto conv_51 = op::Conv2D("conv51").set_input_x(conv_50).set_input_filter(weight); - auto conv_52 = op::Conv2D("conv52").set_input_x(conv_51).set_input_filter(weight); - 
auto conv_53 = op::Conv2D("conv53").set_input_x(conv_52).set_input_filter(weight); - auto conv_54 = op::Conv2D("conv54").set_input_x(conv_53).set_input_filter(weight); - auto conv_55 = op::Conv2D("conv55").set_input_x(conv_54).set_input_filter(weight); - auto conv_56 = op::Conv2D("conv56").set_input_x(conv_55).set_input_filter(weight); - auto conv_57 = op::Conv2D("conv57").set_input_x(conv_56).set_input_filter(weight); - auto conv_58 = op::Conv2D("conv58").set_input_x(conv_57).set_input_filter(weight); - auto conv_59 = op::Conv2D("conv59").set_input_x(conv_58).set_input_filter(weight); - auto conv_60 = op::Conv2D("conv60").set_input_x(conv_59).set_input_filter(weight); - auto conv_61 = op::Conv2D("conv61").set_input_x(conv_60).set_input_filter(weight); - auto conv_62 = op::Conv2D("conv62").set_input_x(conv_61).set_input_filter(weight); - auto conv_63 = op::Conv2D("conv63").set_input_x(conv_62).set_input_filter(weight); - auto conv_64 = op::Conv2D("conv64").set_input_x(conv_63).set_input_filter(weight); - auto conv_65 = op::Conv2D("conv65").set_input_x(conv_64).set_input_filter(weight); - auto conv_66 = op::Conv2D("conv66").set_input_x(conv_65).set_input_filter(weight); - auto conv_67 = op::Conv2D("conv67").set_input_x(conv_66).set_input_filter(weight); - auto conv_68 = op::Conv2D("conv68").set_input_x(conv_67).set_input_filter(weight); - auto conv_69 = op::Conv2D("conv69").set_input_x(conv_68).set_input_filter(weight); - auto conv_70 = op::Conv2D("conv70").set_input_x(conv_69).set_input_filter(weight); - auto conv_71 = op::Conv2D("conv71").set_input_x(conv_70).set_input_filter(weight); - auto conv_72 = op::Conv2D("conv72").set_input_x(conv_71).set_input_filter(weight); - auto conv_73 = op::Conv2D("conv73").set_input_x(conv_72).set_input_filter(weight); - auto conv_74 = op::Conv2D("conv74").set_input_x(conv_73).set_input_filter(weight); - auto conv_75 = op::Conv2D("conv75").set_input_x(conv_74).set_input_filter(weight); - auto conv_76 = op::Conv2D("conv76").set_input_x(conv_75).set_input_filter(weight); - auto conv_77 = op::Conv2D("conv77").set_input_x(conv_76).set_input_filter(weight); - auto conv_78 = op::Conv2D("conv78").set_input_x(conv_77).set_input_filter(weight); - auto conv_79 = op::Conv2D("conv79").set_input_x(conv_78).set_input_filter(weight); - auto conv_80 = op::Conv2D("conv80").set_input_x(conv_79).set_input_filter(weight); - auto conv_81 = op::Conv2D("conv81").set_input_x(conv_80).set_input_filter(weight); - auto conv_82 = op::Conv2D("conv82").set_input_x(conv_81).set_input_filter(weight); - auto conv_83 = op::Conv2D("conv83").set_input_x(conv_82).set_input_filter(weight); - auto conv_84 = op::Conv2D("conv84").set_input_x(conv_83).set_input_filter(weight); - auto conv_85 = op::Conv2D("conv85").set_input_x(conv_84).set_input_filter(weight); - auto conv_86 = op::Conv2D("conv86").set_input_x(conv_85).set_input_filter(weight); - auto conv_87 = op::Conv2D("conv87").set_input_x(conv_86).set_input_filter(weight); - auto conv_88 = op::Conv2D("conv88").set_input_x(conv_87).set_input_filter(weight); - auto conv_89 = op::Conv2D("conv89").set_input_x(conv_88).set_input_filter(weight); - auto conv_90 = op::Conv2D("conv90").set_input_x(conv_89).set_input_filter(weight); - auto conv_91 = op::Conv2D("conv91").set_input_x(conv_80).set_input_filter(weight); - auto conv_92 = op::Conv2D("conv92").set_input_x(conv_91).set_input_filter(weight); - auto conv_93 = op::Conv2D("conv93").set_input_x(conv_92).set_input_filter(weight); - auto conv_94 = 
op::Conv2D("conv94").set_input_x(conv_93).set_input_filter(weight); - auto conv_95 = op::Conv2D("conv95").set_input_x(conv_94).set_input_filter(weight); - auto conv_96 = op::Conv2D("conv96").set_input_x(conv_95).set_input_filter(weight); - auto conv_97 = op::Conv2D("conv97").set_input_x(conv_96).set_input_filter(weight); - auto conv_98 = op::Conv2D("conv98").set_input_x(conv_97).set_input_filter(weight); - auto conv_99 = op::Conv2D("conv99").set_input_x(conv_98).set_input_filter(weight); - auto conv_100 = op::Conv2D("conv100").set_input_x(conv_99).set_input_filter(weight); - auto conv_101 = op::Conv2D("conv101").set_input_x(conv_100).set_input_filter(weight); - auto conv_102 = op::Conv2D("conv102").set_input_x(conv_101).set_input_filter(weight); - auto conv_103 = op::Conv2D("conv103").set_input_x(conv_102).set_input_filter(weight); - auto conv_104 = op::Conv2D("conv104").set_input_x(conv_103).set_input_filter(weight); - auto conv_105 = op::Conv2D("conv105").set_input_x(conv_104).set_input_filter(weight); - auto conv_106 = op::Conv2D("conv106").set_input_x(conv_105).set_input_filter(weight); - auto conv_107 = op::Conv2D("conv107").set_input_x(conv_106).set_input_filter(weight); - auto conv_108 = op::Conv2D("conv108").set_input_x(conv_107).set_input_filter(weight); - auto conv_109 = op::Conv2D("conv109").set_input_x(conv_108).set_input_filter(weight); - auto conv_110 = op::Conv2D("conv110").set_input_x(conv_109).set_input_filter(weight); - auto conv_111 = op::Conv2D("conv111").set_input_x(conv_110).set_input_filter(weight); - auto conv_112 = op::Conv2D("conv112").set_input_x(conv_111).set_input_filter(weight); - auto conv_113 = op::Conv2D("conv113").set_input_x(conv_112).set_input_filter(weight); - auto conv_114 = op::Conv2D("conv114").set_input_x(conv_113).set_input_filter(weight); - auto conv_115 = op::Conv2D("conv115").set_input_x(conv_114).set_input_filter(weight); - auto conv_116 = op::Conv2D("conv116").set_input_x(conv_115).set_input_filter(weight); - auto conv_117 = op::Conv2D("conv117").set_input_x(conv_116).set_input_filter(weight); - auto conv_118 = op::Conv2D("conv118").set_input_x(conv_117).set_input_filter(weight); - auto conv_119 = op::Conv2D("conv119").set_input_x(conv_118).set_input_filter(weight); - auto conv_120 = op::Conv2D("conv120").set_input_x(conv_119).set_input_filter(weight); - auto conv_121 = op::Conv2D("conv121").set_input_x(conv_120).set_input_filter(weight); - auto conv_122 = op::Conv2D("conv122").set_input_x(conv_121).set_input_filter(weight); - auto conv_123 = op::Conv2D("conv123").set_input_x(conv_122).set_input_filter(weight); - auto conv_124 = op::Conv2D("conv124").set_input_x(conv_123).set_input_filter(weight); - auto conv_125 = op::Conv2D("conv125").set_input_x(conv_124).set_input_filter(weight); - auto conv_126 = op::Conv2D("conv126").set_input_x(conv_125).set_input_filter(weight); - auto conv_127 = op::Conv2D("conv127").set_input_x(conv_126).set_input_filter(weight); - auto conv_128 = op::Conv2D("conv128").set_input_x(conv_127).set_input_filter(weight); - auto conv_129 = op::Conv2D("conv129").set_input_x(conv_128).set_input_filter(weight); - auto conv_130 = op::Conv2D("conv130").set_input_x(conv_129).set_input_filter(weight); + auto conv_1 = op::Conv2D("conv1").set_input_x(data).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + + auto conv_2 = op::Conv2D("conv2").set_input_x(conv_1).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_3 = 
op::Conv2D("conv3").set_input_x(conv_2).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_4 = op::Conv2D("conv4").set_input_x(conv_3).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_5 = op::Conv2D("conv5").set_input_x(conv_4).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_6 = op::Conv2D("conv6").set_input_x(conv_5).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_7 = op::Conv2D("conv7").set_input_x(conv_6).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_8 = op::Conv2D("conv8").set_input_x(conv_7).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_9 = op::Conv2D("conv9").set_input_x(conv_8).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_10 = op::Conv2D("conv10").set_input_x(conv_9).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_11 = op::Conv2D("conv11").set_input_x(conv_10).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_12 = op::Conv2D("conv12").set_input_x(conv_11).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_13 = op::Conv2D("conv13").set_input_x(conv_12).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_14 = op::Conv2D("conv14").set_input_x(conv_13).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_15 = op::Conv2D("conv15").set_input_x(conv_14).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_16 = op::Conv2D("conv16").set_input_x(conv_15).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_17 = op::Conv2D("conv17").set_input_x(conv_16).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_18 = op::Conv2D("conv18").set_input_x(conv_17).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_19 = op::Conv2D("conv19").set_input_x(conv_18).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_20 = op::Conv2D("conv20").set_input_x(conv_19).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_21 = op::Conv2D("conv21").set_input_x(conv_20).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_22 = op::Conv2D("conv22").set_input_x(conv_21).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_23 = op::Conv2D("conv23").set_input_x(conv_22).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_24 = op::Conv2D("conv24").set_input_x(conv_23).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_25 = op::Conv2D("conv25").set_input_x(conv_24).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_26 = op::Conv2D("conv26").set_input_x(conv_25).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_27 = op::Conv2D("conv27").set_input_x(conv_26).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_28 = op::Conv2D("conv28").set_input_x(conv_27).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + 
auto conv_29 = op::Conv2D("conv29").set_input_x(conv_28).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_30 = op::Conv2D("conv30").set_input_x(conv_29).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_31 = op::Conv2D("conv31").set_input_x(conv_30).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_32 = op::Conv2D("conv32").set_input_x(conv_31).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_33 = op::Conv2D("conv33").set_input_x(conv_32).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_34 = op::Conv2D("conv34").set_input_x(conv_33).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_35 = op::Conv2D("conv35").set_input_x(conv_34).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_36 = op::Conv2D("conv36").set_input_x(conv_35).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_37 = op::Conv2D("conv37").set_input_x(conv_36).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_38 = op::Conv2D("conv38").set_input_x(conv_37).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_39 = op::Conv2D("conv39").set_input_x(conv_38).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_40 = op::Conv2D("conv40").set_input_x(conv_39).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_41 = op::Conv2D("conv41").set_input_x(conv_40).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_42 = op::Conv2D("conv42").set_input_x(conv_41).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_43 = op::Conv2D("conv43").set_input_x(conv_42).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_44 = op::Conv2D("conv44").set_input_x(conv_43).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_45 = op::Conv2D("conv45").set_input_x(conv_44).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_46 = op::Conv2D("conv46").set_input_x(conv_45).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_47 = op::Conv2D("conv47").set_input_x(conv_46).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_48 = op::Conv2D("conv48").set_input_x(conv_47).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_49 = op::Conv2D("conv49").set_input_x(conv_48).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_50 = op::Conv2D("conv50").set_input_x(conv_49).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_51 = op::Conv2D("conv51").set_input_x(conv_50).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_52 = op::Conv2D("conv52").set_input_x(conv_51).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_53 = op::Conv2D("conv53").set_input_x(conv_52).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_54 = 
op::Conv2D("conv54").set_input_x(conv_53).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_55 = op::Conv2D("conv55").set_input_x(conv_54).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_56 = op::Conv2D("conv56").set_input_x(conv_55).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_57 = op::Conv2D("conv57").set_input_x(conv_56).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_58 = op::Conv2D("conv58").set_input_x(conv_57).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_59 = op::Conv2D("conv59").set_input_x(conv_58).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_60 = op::Conv2D("conv60").set_input_x(conv_59).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_61 = op::Conv2D("conv61").set_input_x(conv_60).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_62 = op::Conv2D("conv62").set_input_x(conv_61).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_63 = op::Conv2D("conv63").set_input_x(conv_62).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_64 = op::Conv2D("conv64").set_input_x(conv_63).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_65 = op::Conv2D("conv65").set_input_x(conv_64).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_66 = op::Conv2D("conv66").set_input_x(conv_65).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_67 = op::Conv2D("conv67").set_input_x(conv_66).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_68 = op::Conv2D("conv68").set_input_x(conv_67).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_69 = op::Conv2D("conv69").set_input_x(conv_68).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_70 = op::Conv2D("conv70").set_input_x(conv_69).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_71 = op::Conv2D("conv71").set_input_x(conv_70).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_72 = op::Conv2D("conv72").set_input_x(conv_71).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_73 = op::Conv2D("conv73").set_input_x(conv_72).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_74 = op::Conv2D("conv74").set_input_x(conv_73).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_75 = op::Conv2D("conv75").set_input_x(conv_74).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_76 = op::Conv2D("conv76").set_input_x(conv_75).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_77 = op::Conv2D("conv77").set_input_x(conv_76).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_78 = op::Conv2D("conv78").set_input_x(conv_77).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_79 = 
op::Conv2D("conv79").set_input_x(conv_78).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_80 = op::Conv2D("conv80").set_input_x(conv_79).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_81 = op::Conv2D("conv81").set_input_x(conv_80).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_82 = op::Conv2D("conv82").set_input_x(conv_81).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_83 = op::Conv2D("conv83").set_input_x(conv_82).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_84 = op::Conv2D("conv84").set_input_x(conv_83).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_85 = op::Conv2D("conv85").set_input_x(conv_84).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_86 = op::Conv2D("conv86").set_input_x(conv_85).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_87 = op::Conv2D("conv87").set_input_x(conv_86).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_88 = op::Conv2D("conv88").set_input_x(conv_87).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_89 = op::Conv2D("conv89").set_input_x(conv_88).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_90 = op::Conv2D("conv90").set_input_x(conv_89).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_91 = op::Conv2D("conv91").set_input_x(conv_80).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_92 = op::Conv2D("conv92").set_input_x(conv_91).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_93 = op::Conv2D("conv93").set_input_x(conv_92).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_94 = op::Conv2D("conv94").set_input_x(conv_93).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_95 = op::Conv2D("conv95").set_input_x(conv_94).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_96 = op::Conv2D("conv96").set_input_x(conv_95).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_97 = op::Conv2D("conv97").set_input_x(conv_96).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_98 = op::Conv2D("conv98").set_input_x(conv_97).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_99 = op::Conv2D("conv99").set_input_x(conv_98).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_100 = op::Conv2D("conv100").set_input_x(conv_99).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_101 = op::Conv2D("conv101").set_input_x(conv_100).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_102 = op::Conv2D("conv102").set_input_x(conv_101).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_103 = op::Conv2D("conv103").set_input_x(conv_102).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_104 = 
op::Conv2D("conv104").set_input_x(conv_103).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_105 = op::Conv2D("conv105").set_input_x(conv_104).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_106 = op::Conv2D("conv106").set_input_x(conv_105).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_107 = op::Conv2D("conv107").set_input_x(conv_106).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_108 = op::Conv2D("conv108").set_input_x(conv_107).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_109 = op::Conv2D("conv109").set_input_x(conv_108).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_110 = op::Conv2D("conv110").set_input_x(conv_109).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_111 = op::Conv2D("conv111").set_input_x(conv_110).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_112 = op::Conv2D("conv112").set_input_x(conv_111).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_113 = op::Conv2D("conv113").set_input_x(conv_112).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_114 = op::Conv2D("conv114").set_input_x(conv_113).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_115 = op::Conv2D("conv115").set_input_x(conv_114).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_116 = op::Conv2D("conv116").set_input_x(conv_115).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_117 = op::Conv2D("conv117").set_input_x(conv_116).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_118 = op::Conv2D("conv118").set_input_x(conv_117).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_119 = op::Conv2D("conv119").set_input_x(conv_118).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_120 = op::Conv2D("conv120").set_input_x(conv_119).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_121 = op::Conv2D("conv121").set_input_x(conv_120).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_122 = op::Conv2D("conv122").set_input_x(conv_121).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_123 = op::Conv2D("conv123").set_input_x(conv_122).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_124 = op::Conv2D("conv124").set_input_x(conv_123).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_125 = op::Conv2D("conv125").set_input_x(conv_124).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_126 = op::Conv2D("conv126").set_input_x(conv_125).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_127 = op::Conv2D("conv127").set_input_x(conv_126).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_128 = op::Conv2D("conv128").set_input_x(conv_127).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_129 = 
op::Conv2D("conv129").set_input_x(conv_128).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); + auto conv_130 = op::Conv2D("conv130").set_input_x(conv_129).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1}); std::vector inputs{data}; std::vector outputs{conv_130}; @@ -716,7 +719,7 @@ int buildConvGraph_new(Graph &graph, std::vector desc_var, std::vect graph.AddOp(var1); graph.AddOp(label1); - auto conv2d = op::Conv2D().set_input_x(data_x_shape).set_input_filter(var).set_attr_strides({1, 1, 1, 1}); + auto conv2d = op::Conv2D().set_input_x(data_x_shape).set_input_filter(var).set_attr_strides({1, 1, 1, 1}).set_attr_pads({0,0,0,0}); update_op_format(conv2d, format); ge::TensorDesc tensor_desc_w = conv2d.GetInputDesc("filter"); tensor_desc_w.SetFormat(format); @@ -724,9 +727,9 @@ int buildConvGraph_new(Graph &graph, std::vector desc_var, std::vect if (flag >= 1) { conv2dgrad.set_input_x(data_x_shape) - .set_attr_filter_sizes(desc_var[0].GetShape().GetDims()) + .set_attr_filter_size(desc_var[0].GetShape().GetDims()) .set_input_out_backprop(conv2d) - .set_attr_strides({1, 1}) + .set_attr_strides({1, 1, 1, 1}) .set_attr_pads({0, 0, 0, 0}); update_op_format(conv2dgrad, format); graph.AddOp(conv2dgrad); diff --git a/tests/st/resnet50/resnet50_train.cc b/tests/st/resnet50/resnet50_train.cc index 4242439f..5e082df5 100644 --- a/tests/st/resnet50/resnet50_train.cc +++ b/tests/st/resnet50/resnet50_train.cc @@ -71,7 +71,8 @@ vector stride_2{2, 2}; .set_input_x(input, "y") \ .set_input_filter(LAYER##_##BLK##_##OPNUM##_weight) \ .set_attr_strides({1, 1, stride[0], stride[1]}) \ - .set_attr_pads(pad); \ + .set_attr_pads(pad) \ + .set_attr_data_format("NCHW"); \ update_op_format(LAYER##_##BLK##_##OPNUM); #define GENERATE_CONSTANT(LAYER, BLK, OPNUM, CONSTNAME) \ @@ -435,7 +436,7 @@ vector stride_2{2, 2}; auto LAYER##_##BLK##_##OPNUM##_propfilter = \ op::Conv2DBackpropFilterD(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("_propfilter")) \ .set_input_x(LAYER##_##BLK##_##OPNUM##_input, "y") \ - .set_attr_filter_sizes(LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetDims()) \ + .set_attr_filter_size(LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetDims()) \ .set_input_out_backprop(input_bngrad, input_bngrad.name_out_dx()) \ .set_attr_strides(stride) \ .set_attr_pads({1, 1, 1, 1}); \ @@ -448,14 +449,14 @@ vector stride_2{2, 2}; .set_input_momentum(label1) \ .set_input_var(LAYER##_##BLK##_##OPNUM##_weight); -///.set_attr_input_sizes({input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(0),LAYER##_##BLK##_##OPNUM##_weight.GetOutputDesc().GetShape().GetDim(1), +///.set_attr_input_size({input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(0),LAYER##_##BLK##_##OPNUM##_weight.GetOutputDesc().GetShape().GetDim(1), ///input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(2)*stride[2], ///input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(3)*stride[3]}) #define GENERATE_CONV_PROP_INPUT(LAYER, BLK, OPNUM, input_bngrad, stride) \ auto LAYER##_##BLK##_##OPNUM##_propinput = \ op::Conv2DBackpropInputD(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("_propinput")) \ - .set_attr_input_sizes(LAYER##_##BLK##_##OPNUM##_input.GetOutputDesc("y").GetShape().GetDims()) \ - .set_input_filters(LAYER##_##BLK##_##OPNUM##_weight) \ + .set_attr_input_size(LAYER##_##BLK##_##OPNUM##_input.GetOutputDesc("y").GetShape().GetDims()) \ + .set_input_filter(LAYER##_##BLK##_##OPNUM##_weight) \ .set_input_out_backprop(input_bngrad, 
diff --git a/tests/st/resnet50/resnet50_train.cc b/tests/st/resnet50/resnet50_train.cc
index 4242439f..5e082df5 100644
--- a/tests/st/resnet50/resnet50_train.cc
+++ b/tests/st/resnet50/resnet50_train.cc
@@ -71,7 +71,8 @@ vector stride_2{2, 2};
       .set_input_x(input, "y") \
       .set_input_filter(LAYER##_##BLK##_##OPNUM##_weight) \
       .set_attr_strides({1, 1, stride[0], stride[1]}) \
-      .set_attr_pads(pad); \
+      .set_attr_pads(pad) \
+      .set_attr_data_format("NCHW"); \
   update_op_format(LAYER##_##BLK##_##OPNUM);
 
 #define GENERATE_CONSTANT(LAYER, BLK, OPNUM, CONSTNAME) \
@@ -435,7 +436,7 @@ vector stride_2{2, 2};
   auto LAYER##_##BLK##_##OPNUM##_propfilter = \
       op::Conv2DBackpropFilterD(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("_propfilter")) \
           .set_input_x(LAYER##_##BLK##_##OPNUM##_input, "y") \
-          .set_attr_filter_sizes(LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetDims()) \
+          .set_attr_filter_size(LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetDims()) \
           .set_input_out_backprop(input_bngrad, input_bngrad.name_out_dx()) \
           .set_attr_strides(stride) \
           .set_attr_pads({1, 1, 1, 1}); \
@@ -448,14 +449,14 @@ vector stride_2{2, 2};
           .set_input_momentum(label1) \
           .set_input_var(LAYER##_##BLK##_##OPNUM##_weight);
 
-///.set_attr_input_sizes({input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(0),LAYER##_##BLK##_##OPNUM##_weight.GetOutputDesc().GetShape().GetDim(1),
+///.set_attr_input_size({input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(0),LAYER##_##BLK##_##OPNUM##_weight.GetOutputDesc().GetShape().GetDim(1),
 ///input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(2)*stride[2],
 ///input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(3)*stride[3]})
 #define GENERATE_CONV_PROP_INPUT(LAYER, BLK, OPNUM, input_bngrad, stride) \
   auto LAYER##_##BLK##_##OPNUM##_propinput = \
       op::Conv2DBackpropInputD(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("_propinput")) \
-          .set_attr_input_sizes(LAYER##_##BLK##_##OPNUM##_input.GetOutputDesc("y").GetShape().GetDims()) \
-          .set_input_filters(LAYER##_##BLK##_##OPNUM##_weight) \
+          .set_attr_input_size(LAYER##_##BLK##_##OPNUM##_input.GetOutputDesc("y").GetShape().GetDims()) \
+          .set_input_filter(LAYER##_##BLK##_##OPNUM##_weight) \
           .set_input_out_backprop(input_bngrad, input_bngrad.name_out_dx()) \
           .set_attr_strides(stride) \
           .set_attr_pads({1, 1, 1, 1}); \
@@ -564,7 +565,7 @@ bool resnet50(Graph &graph) {
   auto data = op::Data().set_attr_index(0);
   auto data1 = op::Data().set_attr_index(1);
   TensorDesc shape_desc(ge::Shape({32, 3, 224, 224}), FORMAT_NCHW, DT_FLOAT);
-  data.update_output_desc_out(shape_desc);
+  data.update_output_desc_y(shape_desc);
 
   TensorDesc desc(ge::Shape({64, 3, 7, 7}), FORMAT_NCHW, DT_FLOAT);
 
@@ -579,7 +580,8 @@ bool resnet50(Graph &graph) {
           .set_input_x(data)
           .set_input_filter(var)
           .set_attr_strides({1, 1, 2, 2})
-          .set_attr_pads({2, 3, 2, 3});
+          .set_attr_pads({2, 3, 2, 3})
+          .set_attr_data_format("NCHW");
   TensorDesc desc_y;
   desc_y.SetFormat(FORMAT_NCHW);  // shape: 32 64 112 112
   conv2d.update_output_desc_y(desc_y);
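The resnet50_train.cc hunks track three interface renames on the backprop operators: set_attr_filter_sizes -> set_attr_filter_size, set_attr_input_sizes -> set_attr_input_size, and set_input_filters -> set_input_filter. A hedged sketch of the renamed calls outside the macros (shapes and operand names are illustrative only, not from the patch):

    auto prop_filter = op::Conv2DBackpropFilterD("prop_filter")
                           .set_input_x(x, "y")
                           .set_attr_filter_size({64, 3, 7, 7})     // was set_attr_filter_sizes
                           .set_input_out_backprop(dy)
                           .set_attr_strides({1, 1, 2, 2})
                           .set_attr_pads({1, 1, 1, 1});
    auto prop_input = op::Conv2DBackpropInputD("prop_input")
                          .set_attr_input_size({32, 3, 224, 224})   // was set_attr_input_sizes
                          .set_input_filter(weight)                 // was set_input_filters
                          .set_input_out_backprop(dy)
                          .set_attr_strides({1, 1, 2, 2})
                          .set_attr_pads({1, 1, 1, 1});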
diff --git a/tests/ut/common/graph/CMakeLists.txt b/tests/ut/common/graph/CMakeLists.txt
index cda1f1e1..064abc16 100644
--- a/tests/ut/common/graph/CMakeLists.txt
+++ b/tests/ut/common/graph/CMakeLists.txt
@@ -37,6 +37,7 @@ include_directories(${GE_SOURCE_DIR}/inc/graph)
 include_directories(${GE_SOURCE_DIR}/inc/common)
 include_directories(${GE_SOURCE_DIR}/third_party/securec/include)
 include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc)
+include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/ops)
 include_directories(${CMAKE_BINARY_DIR})
 include_directories(${CMAKE_BINARY_DIR}/proto/ge)
 
diff --git a/tests/ut/common/graph/testcase/ge_graph/ge_model_serialize_unittest.cc b/tests/ut/common/graph/testcase/ge_graph/ge_model_serialize_unittest.cc
index 291fcbfa..372a673e 100644
--- a/tests/ut/common/graph/testcase/ge_graph/ge_model_serialize_unittest.cc
+++ b/tests/ut/common/graph/testcase/ge_graph/ge_model_serialize_unittest.cc
@@ -371,7 +371,7 @@ TEST(UtestGeModelSerialize, simple) {
     AttrUtils::GetTensorDesc(s_op, "node_key7", s_tensor_desc);
     EXPECT_EQ(s_tensor_desc.GetFormat(), FORMAT_NCHW);
     EXPECT_EQ(s_tensor_desc.GetDataType(), DT_INT16);
-    uint32_t size = 0;
+    int64_t size = 0;
    TensorUtils::GetSize(s_tensor_desc, size);
    EXPECT_EQ(size, 100);
 
diff --git a/tests/ut/common/graph/testcase/ge_graph/ge_node_unittest.cc b/tests/ut/common/graph/testcase/ge_graph/ge_node_unittest.cc
index be0fe27d..f75e85fa 100644
--- a/tests/ut/common/graph/testcase/ge_graph/ge_node_unittest.cc
+++ b/tests/ut/common/graph/testcase/ge_graph/ge_node_unittest.cc
@@ -167,7 +167,9 @@ TEST_F(UtestGeNode, add_link_from) {
   NodePtr n5 = graph_ptr->AddNode(desc_ptr);
   EXPECT_EQ(n3->AddLinkFrom("x", n4), GRAPH_SUCCESS);
   EXPECT_EQ(n3->AddLinkFrom(0, n5), GRAPH_SUCCESS);
-  desc_ptr->input_name_idx_.insert(make_pair("__input1", 1));
+  auto input_name_idx = desc_ptr->GetAllInputName();
+  input_name_idx.insert(make_pair("__input1", 1));
+  desc_ptr->SetAllInputName(input_name_idx);
   EXPECT_EQ(n2->AddLinkFrom(n1), GRAPH_SUCCESS);
 
   OpDescPtr desc_ptr1 = std::make_shared<OpDesc>("name1", "type1");
diff --git a/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc b/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc
index a4af8399..6d34ab59 100644
--- a/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc
+++ b/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc
@@ -125,7 +125,7 @@ TEST_F(UtestGeTensor, tensor_desc) {
   GeTensorDesc d;
   d = c.Clone();
   GeTensorDesc e = c;
-  uint32_t size2 = 0;
+  int64_t size2 = 0;
   EXPECT_EQ(TensorUtils::GetSize(e, size2), GRAPH_SUCCESS);
   EXPECT_EQ(size2, 1);
 
@@ -344,35 +344,35 @@ TEST_F(UtestGeTensor, test_tensor_invalid) {
   TensorDesc tensor_desc(shape);
   std::vector<uint8_t> data({1, 2, 3, 4, 5});
   Tensor tensor1(tensor_desc, data);
-  EXPECT_EQ(tensor1.IsValid(), GRAPH_SUCCESS);
+  EXPECT_EQ(tensor1.IsValid(), GRAPH_FAILED);
 
   // Tensor(const TensorDesc &tensor_desc, const uint8_t *data, size_t size)
   TensorDesc tensor_desc2(Shape({3, 3, 3}), FORMAT_NCHW, DT_FLOAT);
   uint32_t size2 = 3 * 3 * 3;
   uint8_t data2[3 * 3 * 3] = {0};
   Tensor tensor2(tensor_desc2, data2, size2);
-  EXPECT_EQ(tensor2.IsValid(), GRAPH_SUCCESS);
+  EXPECT_EQ(tensor2.IsValid(), GRAPH_FAILED);
 
   // Tensor(TensorDesc &&tensor_desc, std::vector<uint8_t> &&data)
   Tensor tensor3(std::move(tensor_desc), std::move(data));
-  EXPECT_EQ(tensor3.IsValid(), GRAPH_SUCCESS);
+  EXPECT_EQ(tensor3.IsValid(), GRAPH_FAILED);
 
   // Tensor()
   Tensor tensor4;
   tensor4.SetTensorDesc(tensor_desc);
-  EXPECT_EQ(tensor4.IsValid(), GRAPH_SUCCESS);
+  EXPECT_EQ(tensor4.IsValid(), GRAPH_FAILED);
   tensor4.SetData(data);
-  EXPECT_EQ(tensor4.IsValid(), GRAPH_SUCCESS);
+  EXPECT_EQ(tensor4.IsValid(), GRAPH_FAILED);
 
   Tensor tensor5;
   tensor5.SetData(data);
-  EXPECT_EQ(tensor5.IsValid(), GRAPH_SUCCESS);
+  EXPECT_EQ(tensor5.IsValid(), GRAPH_FAILED);
   tensor5.SetTensorDesc(tensor_desc);
-  EXPECT_EQ(tensor5.IsValid(), GRAPH_SUCCESS);
+  EXPECT_EQ(tensor5.IsValid(), GRAPH_FAILED);
 
   // scalar
   TensorDesc tensor_desc6(Shape(), FORMAT_NCHW, DT_FLOAT);
   uint8_t data6 = 2;
   Tensor tensor6(tensor_desc6, &data6, 1);
-  EXPECT_EQ(tensor6.IsValid(), GRAPH_SUCCESS);
+  EXPECT_EQ(tensor6.IsValid(), GRAPH_FAILED);
 }
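The uint32_t -> int64_t edits above follow TensorUtils::GetSize, which now reports tensor sizes through an int64_t out-parameter. A minimal sketch, assuming the ge namespace usage seen in these tests:

    ge::GeTensorDesc desc(ge::GeShape({1, 1, 10, 10}), ge::FORMAT_NCHW, ge::DT_FLOAT16);
    int64_t size = 0;  // a uint32_t local no longer matches the new signature
    if (ge::TensorUtils::GetSize(desc, size) == ge::GRAPH_SUCCESS) {
      // size holds the buffer size, in bytes, recorded on this descriptor
    }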
"${GE_SOURCE_DIR}/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc" + "${GE_SOURCE_DIR}/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc" "${GE_SOURCE_DIR}/src/ge/graph/load/output/output.cc" "${GE_SOURCE_DIR}/src/ge/model/ge_model.cc" "${GE_SOURCE_DIR}/src/ge/common/helper/model_helper.cc" @@ -294,6 +300,7 @@ file(GLOB_RECURSE GRAPH_PASS_COMMON_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR} "${GE_SOURCE_DIR}/src/ge/graph/passes/no_use_reshape_remove_pass.cc" "${GE_SOURCE_DIR}/src/ge/graph/passes/control_op_attr_pass.cc" "${GE_SOURCE_DIR}/src/ge/graph/passes/infershape_pass.cc" + "${GE_SOURCE_DIR}/src/ge/ge_local_engine/engine/host_cpu_engine.cc" ) file(GLOB_RECURSE KERNEL_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR} @@ -402,8 +409,6 @@ file(GLOB_RECURSE PASS_TEST_FILES ${CMAKE_CURRENT_SOURCE_DIR} "graph/passes/dimension_adjust_pass_unittest.cc" "graph/passes/pass_utils_unittest.cc" "graph/passes/net_output_pass_unittest.cc" - "graph/passes/update_net_output_pass_unittest.cc" - "graph/passes/no_reshape_op_remove_pass_unittest.cc" "graph/passes/no_use_reshape_remove_pass_unittest.cc" "graph/passes/infershape_pass_unittest.cc" ) diff --git a/tests/ut/ge/graph/ge_executor_unittest.cc b/tests/ut/ge/graph/ge_executor_unittest.cc index 7f5388ad..b1972b6c 100644 --- a/tests/ut/ge/graph/ge_executor_unittest.cc +++ b/tests/ut/ge/graph/ge_executor_unittest.cc @@ -70,11 +70,11 @@ class UtestGeExecutor : public testing::Test { TEST_F(UtestGeExecutor, fail_UnloadModel_model_manager_stop_unload_error) { uint32_t model_id = 1; ge::GeExecutor ge_executor; - ge_executor.is_init_ = true; + ge_executor.isInit_ = true; ge::Status ret = ge_executor.UnloadModel(model_id); EXPECT_EQ(ge::PARAM_INVALID, ret); - ge_executor.is_init_ = false; + ge_executor.isInit_ = false; ret = ge_executor.UnloadModel(model_id); EXPECT_EQ(ge::GE_EXEC_NOT_INIT, ret); } diff --git a/tests/ut/ge/graph/graph_load_unittest.cc b/tests/ut/ge/graph/graph_load_unittest.cc index e516ec7a..af9d5a37 100644 --- a/tests/ut/ge/graph/graph_load_unittest.cc +++ b/tests/ut/ge/graph/graph_load_unittest.cc @@ -73,9 +73,7 @@ TEST_F(UtestGraphGraphLoad, load_graph_param_invalid1) { TEST_F(UtestGraphGraphLoad, load_graph_param_invalid2) { std::mutex sync_run_mutex; std::condition_variable condition; - std::shared_ptr listener = std::make_shared(); - listener->mutex_ = &sync_run_mutex; - listener->condition_ = &condition; + std::shared_ptr listener = std::make_shared(sync_run_mutex, condition); SubGraphInfo sub_graph1; ge::SubGraphInfoPtr sub_graph_ptr1 = std::make_shared(sub_graph1); diff --git a/tests/ut/ge/graph/load/end_graph_task_unittest.cc b/tests/ut/ge/graph/load/end_graph_task_unittest.cc index dedb2dac..29e7a53a 100644 --- a/tests/ut/ge/graph/load/end_graph_task_unittest.cc +++ b/tests/ut/ge/graph/load/end_graph_task_unittest.cc @@ -47,9 +47,4 @@ TEST_F(UtestEndGraphTask, init_end_graph_task_info) { EXPECT_EQ(task_info.Init(task_def, &model), SUCCESS); model.stream_list_.clear(); } - -TEST_F(UtestEndGraphTask, distribute_success) { - EndGraphTaskInfo task_info; - EXPECT_EQ(task_info.Distribute(), SUCCESS); -} } // namespace ge diff --git a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc index c91ca44b..f8deff7f 100644 --- a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc +++ b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc @@ -149,7 +149,7 @@ 
diff --git a/tests/ut/ge/graph/load/end_graph_task_unittest.cc b/tests/ut/ge/graph/load/end_graph_task_unittest.cc
index dedb2dac..29e7a53a 100644
--- a/tests/ut/ge/graph/load/end_graph_task_unittest.cc
+++ b/tests/ut/ge/graph/load/end_graph_task_unittest.cc
@@ -47,9 +47,4 @@ TEST_F(UtestEndGraphTask, init_end_graph_task_info) {
   EXPECT_EQ(task_info.Init(task_def, &model), SUCCESS);
   model.stream_list_.clear();
 }
-
-TEST_F(UtestEndGraphTask, distribute_success) {
-  EndGraphTaskInfo task_info;
-  EXPECT_EQ(task_info.Distribute(), SUCCESS);
-}
 }  // namespace ge
diff --git a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc
index c91ca44b..f8deff7f 100644
--- a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc
+++ b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc
@@ -149,7 +149,7 @@ TEST_F(UtestModelManagerDavinciModel, failed_rt_free_host) {
   model.data_op_list_.push_back(op_desc);
 
-  EXPECT_EQ(ge::INTERNAL_ERROR, model.ReturnResult(1, 1, false, false, &output_data));
+  EXPECT_EQ(ge::INTERNAL_ERROR, model.ReturnResult(1, false, false, &output_data));
 }
 
 // test modeldef_fail
@@ -274,7 +274,7 @@ TEST_F(UtestModelManagerDavinciModel, failed_reset_device) {
   rtMalloc(&buf_data.data, 128, RT_MEMORY_HBM);
   buf_data.length = 128;
   output_data.blobs.push_back(buf_data);
-  EXPECT_EQ(ge::INTERNAL_ERROR, model.ReturnResult(1, 1, true, false, &output_data));
+  EXPECT_EQ(ge::INTERNAL_ERROR, model.ReturnResult(1, true, false, &output_data));
   rtFree(buf_data.data);
 }
 
@@ -1222,7 +1222,7 @@ TEST_F(UtestModelManagerDavinciModel, success_output_list_0) {
   Status ret = VarManager::Instance(session_id)->Init(version, session_id, device_id, job_id);
   EXPECT_EQ(ret, ge::SUCCESS);
 
-  ret = model.ReturnNoOutput(1, 1);
+  ret = model.ReturnNoOutput(1);
   EXPECT_EQ(ret, ge::SUCCESS);
 
   VarManagerPool::Instance().Destroy();
diff --git a/tests/ut/ge/graph/load/output_net_output_unittest.cc b/tests/ut/ge/graph/load/output_net_output_unittest.cc
index 7897378e..52fdebfa 100644
--- a/tests/ut/ge/graph/load/output_net_output_unittest.cc
+++ b/tests/ut/ge/graph/load/output_net_output_unittest.cc
@@ -94,7 +94,7 @@ TEST_F(UtestNetOutput, test_get_input_size) {
   builder.AddOutput(1);
   builder.Finish();
 
-  vector<uint32_t> v_output_size = ModelUtils::GetInputSize(custom_op_desc);
+  vector<int64_t> v_output_size = ModelUtils::GetInputSize(custom_op_desc);
   EXPECT_EQ(v_output_size.size(), 1);
 }
 
@@ -152,7 +152,7 @@ TEST_F(UtestNetOutput, success_is_output_tensor_need_trans) {
 
 // test ModelUtils::GetOutputSize
 TEST_F(UtestNetOutput, success_get_output_size) {
-  vector<uint32_t> v_output_size;
+  vector<int64_t> v_output_size;
 
   ModelUtils *model_utils = new ModelUtils();
   std::shared_ptr<OpDesc> op_desc = std::make_shared<OpDesc>();
@@ -172,7 +172,7 @@ TEST_F(UtestNetOutput, success_get_output_size) {
 
 // test ModelUtils::GetWorkspaceSize
 TEST_F(UtestNetOutput, success_get_workspace_size) {
-  vector<uint32_t> v_workspace_size;
+  vector<int64_t> v_workspace_size;
 
   ModelUtils *model_utils = new ModelUtils();
   std::shared_ptr<OpDesc> op_desc = std::make_shared<OpDesc>();
@@ -188,7 +188,7 @@ TEST_F(UtestNetOutput, success_get_workspace_size) {
 
 // test ModelUtils::GetWeightSize
 TEST_F(UtestNetOutput, success_get_weight_size) {
-  vector<uint32_t> v_weight_size;
+  vector<int64_t> v_weight_size;
 
   ModelUtils *model_utils = new ModelUtils();
   std::shared_ptr<OpDesc> op_desc = std::make_shared<OpDesc>();
@@ -292,7 +292,7 @@ TEST_F(UtestNetOutput, success_get_output_data) {
   output->input_num_ = 1;
 
   vector<void *> v_data_addr;
-  vector<uint32_t> v_data_size;
+  vector<int64_t> v_data_size;
   output->GetOutputData(v_data_addr, v_data_size);
 
   EXPECT_EQ(output->v_input_data_addr_, v_data_addr);
diff --git a/tests/ut/ge/graph/passes/addn_pass_unittest.cc b/tests/ut/ge/graph/passes/addn_pass_unittest.cc
index e31030b2..6107a7d8 100644
--- a/tests/ut/ge/graph/passes/addn_pass_unittest.cc
+++ b/tests/ut/ge/graph/passes/addn_pass_unittest.cc
@@ -129,7 +129,7 @@ TEST(UtestGraphPassesAddnPass, no_output) {
   AddNPass addn_pass;
   NamesToPass names_to_pass;
   names_to_pass.emplace_back("Test", &addn_pass);
-  EXPECT_EQ(pass.Run(names_to_pass), INTERNAL_ERROR);
+  EXPECT_NE(pass.Run(names_to_pass), SUCCESS);
 
   EXPECT_FALSE(add_n_node->GetInDataNodes().empty());
   EXPECT_TRUE(add_n_node->GetOutDataNodes().empty());
diff --git a/tests/ut/ge/graph/passes/folding_kernel/dynamic_stitch_kernel_unittest.cc b/tests/ut/ge/graph/passes/folding_kernel/dynamic_stitch_kernel_unittest.cc
index 7b51b893..cd6fc974 100644
--- a/tests/ut/ge/graph/passes/folding_kernel/dynamic_stitch_kernel_unittest.cc
+++ b/tests/ut/ge/graph/passes/folding_kernel/dynamic_stitch_kernel_unittest.cc
@@ -49,7 +49,7 @@ class UtestGraphPassesFoldingKernelDynamicStitchKernel : public testing::Test {
 
 TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, IndiceFloatSuccess) {
   OpDescPtr op_desc_ptr = std::make_shared<OpDesc>("dynamicstitch", "DynamicStitch");
-  AttrUtils::SetInt(op_desc_ptr, "DynamicStitchN_", (int64_t)2);
+  AttrUtils::SetInt(op_desc_ptr, "N", (int64_t)2);
 
   vector<bool> is_input_const_vec = {true, true, true, true};
   op_desc_ptr->SetIsInputConst(is_input_const_vec);
@@ -81,6 +81,7 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, IndiceFloatSuccess) {
   op_desc_ptr->AddInputDesc(tensor_desc_1);
   op_desc_ptr->AddInputDesc(tensor_desc_2);
   op_desc_ptr->AddInputDesc(tensor_desc_3);
+  op_desc_ptr->AddOutputDesc(tensor_desc_3);
 
   vector<ConstGeTensorPtr> input = {tensor_0, tensor_1, tensor_2, tensor_3};
   vector<GeTensorPtr> outputs;
@@ -99,7 +100,7 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, IndiceFloatSuccess) {
 
 TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, ScalerIndiceDoubleSuccess) {
   OpDescPtr op_desc_ptr = std::make_shared<OpDesc>("dynamicstitch", "DynamicStitch");
-  AttrUtils::SetInt(op_desc_ptr, "DynamicStitchN_", (int64_t)2);
+  AttrUtils::SetInt(op_desc_ptr, "N", (int64_t)2);
 
   vector<bool> is_input_const_vec = {true, true, true, true};
   op_desc_ptr->SetIsInputConst(is_input_const_vec);
@@ -131,6 +132,7 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, ScalerIndiceDoubleSucce
   op_desc_ptr->AddInputDesc(tensor_desc_1);
   op_desc_ptr->AddInputDesc(tensor_desc_2);
   op_desc_ptr->AddInputDesc(tensor_desc_3);
+  op_desc_ptr->AddOutputDesc(tensor_desc_3);
 
   vector<ConstGeTensorPtr> input = {tensor_0, tensor_1, tensor_2, tensor_3};
   vector<GeTensorPtr> outputs;
@@ -148,7 +150,7 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, ScalerIndiceDoubleSucce
 
 TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, UnsupportedDataType) {
   OpDescPtr op_desc_ptr = std::make_shared<OpDesc>("dynamicstitch", "DynamicStitch");
-  AttrUtils::SetInt(op_desc_ptr, "DynamicStitchN_", (int64_t)2);
+  AttrUtils::SetInt(op_desc_ptr, "N", (int64_t)2);
 
   vector<bool> is_input_const_vec = {true, true, true, true};
   op_desc_ptr->SetIsInputConst(is_input_const_vec);
@@ -180,6 +182,7 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, UnsupportedDataType) {
   op_desc_ptr->AddInputDesc(tensor_desc_1);
   op_desc_ptr->AddInputDesc(tensor_desc_2);
   op_desc_ptr->AddInputDesc(tensor_desc_3);
+  op_desc_ptr->AddOutputDesc(tensor_desc_3);
 
   vector<ConstGeTensorPtr> input = {tensor_0, tensor_1, tensor_2, tensor_3};
   vector<GeTensorPtr> outputs;
@@ -196,9 +199,14 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, ValidateParamFail) {
   OpDescPtr op_desc_ptr = nullptr;
   shared_ptr<Kernel> kernel = KernelFactory::Instance().Create(DYNAMICSTITCH);
   Status status = kernel->Compute(nullptr, empty_input, empty_output);
-  EXPECT_EQ(status, ge::PARAM_INVALID);
+  EXPECT_EQ(status, NOT_CHANGED);
+  // outputdesc is null
+  op_desc_ptr = make_shared<OpDesc>("dynamicstitch", "DynamicStitch");
+  status = kernel->Compute(op_desc_ptr, empty_input, empty_output);
+  EXPECT_EQ(status, NOT_CHANGED);
   // input is empty
   op_desc_ptr = std::make_shared<OpDesc>("dynamicstitch", "DynamicStitch");
+  op_desc_ptr->AddOutputDesc(GeTensorDesc());
   status = kernel->Compute(op_desc_ptr, empty_input, empty_output);
   EXPECT_EQ(status, NOT_CHANGED);
 
@@ -219,14 +227,14 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, ValidateParamFail) {
   status = kernel->Compute(op_desc_ptr, input, empty_output);
   EXPECT_EQ(status, NOT_CHANGED);
-  AttrUtils::SetInt(op_desc_ptr, "DynamicStitchN_", (int64_t)4);
+  AttrUtils::SetInt(op_desc_ptr, "N", (int64_t)4);
   status = kernel->Compute(op_desc_ptr, input, empty_output);
   EXPECT_EQ(status, NOT_CHANGED);
 }
 
 TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, RepeatedIndiceInt32Success) {
   OpDescPtr op_desc_ptr = std::make_shared<OpDesc>("dynamicstitch", "DynamicStitch");
-  AttrUtils::SetInt(op_desc_ptr, "DynamicStitchN_", (int64_t)2);
+  AttrUtils::SetInt(op_desc_ptr, "N", (int64_t)2);
 
   vector<bool> is_input_const_vec = {true, true, true, true};
   op_desc_ptr->SetIsInputConst(is_input_const_vec);
@@ -258,6 +266,7 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, RepeatedIndiceInt32Succ
   op_desc_ptr->AddInputDesc(tensor_desc_1);
   op_desc_ptr->AddInputDesc(tensor_desc_2);
   op_desc_ptr->AddInputDesc(tensor_desc_3);
+  op_desc_ptr->AddOutputDesc(tensor_desc_3);
 
   vector<ConstGeTensorPtr> input = {tensor_0, tensor_1, tensor_2, tensor_3};
   vector<GeTensorPtr> outputs;
@@ -274,7 +283,7 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, RepeatedIndiceInt32Succ
 
 TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, RepeatedIndiceInt64Success) {
   OpDescPtr op_desc_ptr = std::make_shared<OpDesc>("dynamicstitch", "DynamicStitch");
-  AttrUtils::SetInt(op_desc_ptr, "DynamicStitchN_", (int64_t)2);
+  AttrUtils::SetInt(op_desc_ptr, "N", (int64_t)2);
 
   vector<bool> is_input_const_vec = {true, true, true, true};
   op_desc_ptr->SetIsInputConst(is_input_const_vec);
@@ -306,6 +315,7 @@ TEST_F(UtestGraphPassesFoldingKernelDynamicStitchKernel, RepeatedIndiceInt64Succ
   op_desc_ptr->AddInputDesc(tensor_desc_1);
   op_desc_ptr->AddInputDesc(tensor_desc_2);
   op_desc_ptr->AddInputDesc(tensor_desc_3);
+  op_desc_ptr->AddOutputDesc(tensor_desc_3);
 
   vector<ConstGeTensorPtr> input = {tensor_0, tensor_1, tensor_2, tensor_3};
   vector<GeTensorPtr> outputs;
 
diff --git a/tests/ut/ge/graph/passes/folding_kernel/mul_kernel_unittest.cc b/tests/ut/ge/graph/passes/folding_kernel/mul_kernel_unittest.cc
index 8c13c58d..7cb5ad2d 100644
--- a/tests/ut/ge/graph/passes/folding_kernel/mul_kernel_unittest.cc
+++ b/tests/ut/ge/graph/passes/folding_kernel/mul_kernel_unittest.cc
@@ -93,7 +93,12 @@ TEST_F(UtestGraphPassesFoldingKernelMulKernel, DoubleNotchanged) {
 
   shared_ptr<Kernel> kernel = KernelFactory::Instance().Create(MUL);
   Status status = kernel->Compute(op_desc_ptr, input, outputs);
-  EXPECT_EQ(NOT_CHANGED, status);
+  const double *output_data = reinterpret_cast<const double *>(outputs[0]->GetData().data());
+  double diff = output_data[0] - (15.0);
+  bool is_same = fabs(diff) < 0.00001 ? true : false;
+
+  EXPECT_EQ(is_same, true);
+  EXPECT_EQ(domi::SUCCESS, status);
 }
 
 TEST_F(UtestGraphPassesFoldingKernelMulKernel, MulOverflow) {
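The DynamicStitch edits above rename the folding kernel's count attribute from the internal "DynamicStitchN_" to the operator's standard "N", and require an output desc before Compute is called. A minimal sketch using the same helpers as the tests:

    OpDescPtr stitch = std::make_shared<OpDesc>("stitch", "DynamicStitch");
    AttrUtils::SetInt(stitch, "N", static_cast<int64_t>(2));  // was "DynamicStitchN_"
    stitch->AddOutputDesc(GeTensorDesc());  // Compute() now returns NOT_CHANGED without one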
diff --git a/tests/ut/ge/graph/passes/no_reshape_op_remove_pass_unittest.cc b/tests/ut/ge/graph/passes/no_reshape_op_remove_pass_unittest.cc
deleted file mode 100644
index 847daa94..00000000
--- a/tests/ut/ge/graph/passes/no_reshape_op_remove_pass_unittest.cc
+++ /dev/null
@@ -1,204 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "graph/passes/no_reshape_op_remove_pass.h"
-
-#include <gtest/gtest.h>
-
-#include "common/ge_inner_error_codes.h"
-#include "graph/compute_graph.h"
-#include "graph/debug/graph_debug.h"
-#include "graph/manager/graph_manager.h"
-#include "graph/manager/graph_manager_utils.h"
-#include "graph/op_desc.h"
-#include "graph/operator_reg.h"
-#include "graph/utils/op_desc_utils.h"
-#include "graph_builder_utils.h"
-#include "inc/pass_manager.h"
-#include "opskernel_manager/ops_kernel_manager.h"
-
-using namespace std;
-using namespace testing;
-using namespace ge;
-
-class UtestGraphNoReshapeOpRemovePass : public testing::Test {
- protected:
-  void SetUp() {}
-  void TearDown() {}
-};
-
-class NodeBuilder {
- public:
-  NodeBuilder(const std::string &name, const std::string &type) { op_desc_ = std::make_shared<OpDesc>(name, type); }
-  NodeBuilder &AddInputDesc(std::initializer_list<int64_t> shape, ge::Format format = FORMAT_NCHW,
-                            ge::DataType data_type = DT_FLOAT) {
-    op_desc_->AddInputDesc(CreateTensorDesc(shape, format, data_type)->Clone());
-    return *this;
-  }
-  NodeBuilder &AddOutputDesc(std::initializer_list<int64_t> shape, ge::Format format = FORMAT_NCHW,
-                             ge::DataType data_type = DT_FLOAT) {
-    op_desc_->AddOutputDesc(CreateTensorDesc(shape, format, data_type)->Clone());
-    return *this;
-  }
-  ge::NodePtr Build(const ge::ComputeGraphPtr &graph) { return graph->AddNode(op_desc_); }
-
- private:
-  ge::GeTensorDescPtr CreateTensorDesc(std::initializer_list<int64_t> shape, ge::Format format = FORMAT_NCHW,
-                                       ge::DataType data_type = DT_FLOAT) {
-    GeShape ge_shape{std::vector<int64_t>(shape)};
-    ge::GeTensorDescPtr tensor_desc = std::make_shared<GeTensorDesc>();
-    tensor_desc->SetShape(ge_shape);
-    tensor_desc->SetFormat(format);
-    tensor_desc->SetDataType(data_type);
-    return tensor_desc;
-  }
-  ge::OpDescPtr op_desc_;
-};
-
-/// data->expanddim->reshape1->reshape2->reshape3->squeeze->reshape4->sinh
-///                                                    /
-///                                               const
-void make_graph(ComputeGraphPtr &graph) {
-  ge::NodePtr node_data = NodeBuilder("Data4D", DATA).AddOutputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT).Build(graph);
-
-  ge::NodePtr node_expanddim_1 = NodeBuilder("ExpandDim", EXPANDDIMS)
-                                     .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                     .AddOutputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                     .Build(graph);
-
-  ge::NodePtr node_reshape_1 = NodeBuilder("Reshape_1", RESHAPE)
-                                   .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                   .AddOutputDesc({2, 1, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                   .Build(graph);
-
-  ge::NodePtr node_reshape_2 = NodeBuilder("Reshape_2", RESHAPE)
-                                   .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                   .AddOutputDesc({2, 1, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                   .Build(graph);
-
-  ge::NodePtr node_reshape_3 = NodeBuilder("Reshape_3", RESHAPE)
-                                   .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                   .AddOutputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                   .Build(graph);
-
-  ge::NodePtr node_squeeze_1 = NodeBuilder("Squeeze", SQUEEZE)
-                                   .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                   .AddOutputDesc({2, 1, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                   .Build(graph);
-  ge::NodePtr node_const =
-      NodeBuilder("const", CONSTANT).AddOutputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT).Build(graph);
-
-  ge::NodePtr node_reshape_4 = NodeBuilder("Reshape_4", RESHAPE)
-                                   .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                   .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                   .AddOutputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                   .Build(graph);
-
-  ge::NodePtr node_sinh_1 = NodeBuilder("sinh", SINH)
-                                .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                .AddOutputDesc({2, 1, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                .Build(graph);
-
-  GraphUtils::AddEdge(node_data->GetOutDataAnchor(0), node_expanddim_1->GetInDataAnchor(0));
-  GraphUtils::AddEdge(node_expanddim_1->GetOutDataAnchor(0), node_reshape_1->GetInDataAnchor(0));
-  GraphUtils::AddEdge(node_reshape_1->GetOutDataAnchor(0), node_reshape_2->GetInDataAnchor(0));
-  GraphUtils::AddEdge(node_reshape_2->GetOutDataAnchor(0), node_reshape_3->GetInDataAnchor(0));
-  GraphUtils::AddEdge(node_reshape_3->GetOutDataAnchor(0), node_squeeze_1->GetInDataAnchor(0));
-  GraphUtils::AddEdge(node_squeeze_1->GetOutDataAnchor(0), node_reshape_4->GetInDataAnchor(0));
-  GraphUtils::AddEdge(node_const->GetOutDataAnchor(0), node_reshape_4->GetInDataAnchor(1));
-  GraphUtils::AddEdge(node_reshape_4->GetOutDataAnchor(0), node_sinh_1->GetInDataAnchor(0));
-}
-
-// reshape->permute->transdata->correlation
-void make_graph_for_sfc(ComputeGraphPtr &graph) {
-  // Node4D
-  ge::NodePtr node_data = NodeBuilder("Data4D", DATA).AddOutputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT).Build(graph);
-
-  // reshape1
-  ge::NodePtr node_reshape_1 = NodeBuilder("Reshape_3", RESHAPE)
-                                   .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                   .AddOutputDesc({2, 1, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                   .Build(graph);
-  // permute
-  ge::NodePtr node_permute_1 = NodeBuilder("permute", PERMUTE)
-                                   .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                   .AddOutputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                   .Build(graph);
-  // transdata
-  ge::NodePtr node_transdata_1 = NodeBuilder("transdata", TRANSDATA)
-                                     .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                     .AddOutputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                     .Build(graph);
-  // transdata
-  ge::NodePtr node_correlation_1 = NodeBuilder("correlation", CORRELATION)
-                                       .AddInputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT)
-                                       .AddOutputDesc({2, 2, 2, 2}, FORMAT_NCHW, DT_FLOAT16)
-                                       .Build(graph);
-  // add edge
-  ge::GraphUtils::AddEdge(node_data->GetOutDataAnchor(0), node_reshape_1->GetInDataAnchor(0));
-  ge::GraphUtils::AddEdge(node_reshape_1->GetOutDataAnchor(0), node_permute_1->GetInDataAnchor(0));
-  ge::GraphUtils::AddEdge(node_permute_1->GetOutDataAnchor(0), node_transdata_1->GetInDataAnchor(0));
-  ge::GraphUtils::AddEdge(node_transdata_1->GetOutDataAnchor(0), node_correlation_1->GetInDataAnchor(0));
-}
-
-TEST_F(UtestGraphNoReshapeOpRemovePass, node_to_be_delete_success) {
-  ge::ComputeGraphPtr compute_graph = std::make_shared<ComputeGraph>("test");
-  make_graph(compute_graph);
-
-  NoReshapeOpRemovePass noreshapepass;
-  ge::NodePtr expandDim1 = compute_graph->FindNode("ExpandDim");
-  Status status = noreshapepass.Run(expandDim1);
-  EXPECT_EQ(status, ge::SUCCESS);
-  expandDim1 = compute_graph->FindNode("ExpandDim");
-  EXPECT_EQ(expandDim1, nullptr);
-
-  ge::NodePtr reshape1 = compute_graph->FindNode("Reshape_1");
-  status = noreshapepass.Run(reshape1);
-  EXPECT_EQ(status, ge::SUCCESS);
-  reshape1 = compute_graph->FindNode("Reshape_1");
-  EXPECT_EQ(reshape1, nullptr);
-
-  ge::NodePtr reshape2 = compute_graph->FindNode("Reshape_2");
-  EXPECT_EQ(reshape2, nullptr);
-
-  ge::NodePtr reshape3 = compute_graph->FindNode("Reshape_3");
-  EXPECT_EQ(reshape3, nullptr);
-
-  ge::NodePtr reshape4 = compute_graph->FindNode("Reshape_4");
-  status = noreshapepass.Run(reshape4);
-  EXPECT_EQ(status, ge::SUCCESS);
-  reshape4 = compute_graph->FindNode("Reshape_4");
-  EXPECT_EQ(reshape4, nullptr);
-
-  ge::NodePtr const1 = compute_graph->FindNode("const");
-  auto output_size = const1->GetOutDataNodes().size();
-  EXPECT_EQ(output_size, 0);
-  ge::NodePtr sinh1 = compute_graph->FindNode("sinh");
-  auto input_size = sinh1->GetInDataNodes().size();
-  EXPECT_EQ(input_size, 1);
-}
-
-TEST_F(UtestGraphNoReshapeOpRemovePass, reshape_for_sfc_net_success) {
-  ge::ComputeGraphPtr graph = std::make_shared<ComputeGraph>("test");
-  make_graph_for_sfc(graph);
-  NoReshapeOpRemovePass noreshapepass;
-
-  NodePtr reshape_node = graph->FindNode("Reshape_3");
-  noreshapepass.Run(reshape_node);
-  NodePtr permute_node = graph->FindNode("permute");
-  bool flag = false;
-  AttrUtils::GetBool(permute_node->GetOpDesc(), "reshape_correlation", flag);
-  EXPECT_EQ(flag, true);
-}
diff --git a/tests/ut/ge/graph/passes/transop_nearby_allreduce_fusion_pass_unittest.cc b/tests/ut/ge/graph/passes/transop_nearby_allreduce_fusion_pass_unittest.cc
index 0e144432..1220b35e 100644
--- a/tests/ut/ge/graph/passes/transop_nearby_allreduce_fusion_pass_unittest.cc
+++ b/tests/ut/ge/graph/passes/transop_nearby_allreduce_fusion_pass_unittest.cc
@@ -365,7 +365,7 @@ TEST(UtestTransopNearbyAllreduceFusionPass, test8_in_and_out_data_anchor_are_not
   NamesToPass names_to_pass;
   names_to_pass.emplace_back("TransOpNearbyAllreduceFusionPass", &transop_nearby_allreduce_fusion_pass);
   Status ret = ge_pass.Run(names_to_pass);
-  EXPECT_EQ(ret, INTERNAL_ERROR);
+  EXPECT_EQ(ret, FAILED);
 }
 }  // namespace
diff --git a/tests/ut/ge/graph/passes/update_net_output_pass_unittest.cc b/tests/ut/ge/graph/passes/update_net_output_pass_unittest.cc
deleted file mode 100644
index 78186344..00000000
--- a/tests/ut/ge/graph/passes/update_net_output_pass_unittest.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-
-#define protected public
-#define private public
-#include "graph/passes/update_net_output_pass.h"
-
-#include "common/op/ge_op_utils.h"
-#include "common/types.h"
-#include "graph/anchor.h"
-#include "graph/attr_value.h"
-#include "graph/compute_graph.h"
-#include "graph/op_desc.h"
-#include "graph/utils/attr_utils.h"
-#include "graph/utils/graph_utils.h"
-#include "graph/utils/op_desc_utils.h"
-#include "graph/utils/tensor_utils.h"
-#include "graph_builder_utils.h"
-#include "omg/omg_inner_types.h"
-#undef protected
-#undef private
-
-using namespace testing;
-namespace ge {
-class UtestNodePassesUpdateNetoutputPass : public Test {
- protected:
-  UtestNodePassesUpdateNetoutputPass() = default;
-};
-
-namespace {
-///    net_output1
-///        |
-///       addn
-///      /    \
-///     /      \
-///  const1  const2
-ComputeGraphPtr BuildGraph1() {
-  auto builder = ut::GraphBuilder("test");
-  auto const1 = builder.AddNode("const1", CONSTANT, 0, 1);
-  auto const2 = builder.AddNode("const2", CONSTANT, 0, 1);
-  auto addn1 = builder.AddNode("addn1", ADDN, 2, 1);
-  auto net_output1 = builder.AddNode("net_output", NETOUTPUT, 1, 1);
-
-  builder.AddDataEdge(const1, 0, addn1, 0);
-  builder.AddDataEdge(const2, 0, addn1, 1);
-  builder.AddDataEdge(addn1, 0, net_output1, 0);
-  return builder.GetGraph();
-}
-}  // namespace
-
-TEST_F(UtestNodePassesUpdateNetoutputPass, update_netoutput_succ) {
-  auto graph = BuildGraph1();
-  auto net_output = graph->FindNode("net_output");
-  EXPECT_NE(net_output, nullptr);
-
-  auto tensor = net_output->GetOpDesc()->GetOutputDesc(0);
-  EXPECT_EQ(tensor.GetDataType(), DT_FLOAT);
-  EXPECT_EQ(tensor.GetFormat(), FORMAT_NCHW);
-
-  ge::NodePtr node = nullptr;
-  ReUpdateNetOutputPass re_update_net_output_pass;
-  Status status = re_update_net_output_pass.Run(node);
-  EXPECT_EQ(FAILED, status);
-
-  status = re_update_net_output_pass.Run(net_output);
-  EXPECT_EQ(SUCCESS, status);
-
-  domi::GetContext().output_type = "FP17";
-  status = re_update_net_output_pass.Run(net_output);
-  EXPECT_EQ(SUCCESS, status);
-
-  domi::GetContext().output_type = "FP16";
-  status = re_update_net_output_pass.Run(net_output);
-  EXPECT_EQ(SUCCESS, status);
-  auto in_desc = net_output->GetOpDesc()->GetInputDesc(0);
-  EXPECT_EQ(in_desc.GetDataType(), DT_FLOAT16);
-  auto out_desc = net_output->GetOpDesc()->GetOutputDesc(0);
-  EXPECT_EQ(out_desc.GetDataType(), DT_FLOAT16);
-}
-}  // namespace ge
diff --git a/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc b/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc
index 6081ca43..5027c988 100644
--- a/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc
+++ b/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc
@@ -84,13 +84,13 @@ TEST_F(UtestGeProfilinganager, start_profiling_success) {
   Status ret = ProfilingManager::Instance().Init(options);
   EXPECT_EQ(ret, ge::SUCCESS);
 
-  ret = ProfilingManager::Instance().StartProfiling(iter_num);
+  ret = ProfilingManager::Instance().StartProfiling(iter_num, 0);
   EXPECT_EQ(ret, ge::SUCCESS);
 
   setenv("PROFILING_OPTIONS", "op_trance", true);
   ret = ProfilingManager::Instance().Init(options);
   EXPECT_EQ(ret, ge::SUCCESS);
-  ret = ProfilingManager::Instance().StartProfiling(iter_num);
+  ret = ProfilingManager::Instance().StartProfiling(iter_num, 0);
   EXPECT_EQ(ret, ge::SUCCESS);
 }
 
@@ -107,18 +107,11 @@ TEST_F(UtestGeProfilinganager, stop_profiling_success) {
   setenv("PROFILING_OPTIONS", "op_trance", true);
   ret = ProfilingManager::Instance().Init(options);
   EXPECT_EQ(ret, ge::SUCCESS);
-  ret = ProfilingManager::Instance().StartProfiling(iter_num);
+  ret = ProfilingManager::Instance().StartProfiling(iter_num, 0);
   EXPECT_EQ(ret, ge::SUCCESS);
   ProfilingManager::Instance().StopProfiling();
 }
 
-TEST_F(UtestGeProfilinganager, report_profiling_data_success) {
-  map<uint32_t, string> op_task_id_map;
-  op_task_id_map[0] = "conv";
-  op_task_id_map.insert(pair<uint32_t, string>(1, "mul"));
-  ProfilingManager::Instance().ReportProfilingData(op_task_id_map);
-}
-
 TEST_F(UtestGeProfilinganager, plugin_impl_success) {
   PluginImpl plugin_Impl("FMK");
   TestReporter test_reporter;
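The profiling hunks above add a second argument to StartProfiling; judging only from the updated tests it appears to be a device id, passed as 0 here. A hedged one-liner with that assumption:

    ge::Status ret = ge::ProfilingManager::Instance().StartProfiling(iter_num, 0);  // (iteration, assumed device id)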
diff --git a/third_party/fwkacllib/inc/aicpu/common/aicpu_task_struct.h b/third_party/fwkacllib/inc/aicpu/common/aicpu_task_struct.h
index bb1f63a9..d058ead2 100644
--- a/third_party/fwkacllib/inc/aicpu/common/aicpu_task_struct.h
+++ b/third_party/fwkacllib/inc/aicpu/common/aicpu_task_struct.h
@@ -21,12 +21,13 @@
 
 namespace aicpu {
 
-struct AicpuParamHead {
+struct AicpuParamHead
+{
     uint32_t length;     // Total length: include cunstom message
     uint32_t ioAddrNum;  // Input and output address number
 } __attribute__ ((packed));
 
-}  // namespace aicpu
+} // namespace aicpu
 
 #endif  // AICPU_TASK_STRUCT_H_
 
diff --git a/third_party/fwkacllib/inc/cce/cce_def.hpp b/third_party/fwkacllib/inc/cce/cce_def.hpp
old mode 100755
new mode 100644
diff --git a/third_party/fwkacllib/inc/cce/common/attr_list.hpp b/third_party/fwkacllib/inc/cce/common/attr_list.hpp
old mode 100755
new mode 100644
diff --git a/third_party/fwkacllib/inc/cce/common/catch.hpp b/third_party/fwkacllib/inc/cce/common/catch.hpp
old mode 100755
new mode 100644
diff --git a/third_party/fwkacllib/inc/cce/dnn_base_def.hpp b/third_party/fwkacllib/inc/cce/dnn_base_def.hpp
old mode 100755
new mode 100644
diff --git a/third_party/fwkacllib/inc/cce/dnn_op.h b/third_party/fwkacllib/inc/cce/dnn_op.h
index 7895066b..627b8593 100644
--- a/third_party/fwkacllib/inc/cce/dnn_op.h
+++ b/third_party/fwkacllib/inc/cce/dnn_op.h
@@ -3461,7 +3461,7 @@ ccStatus_t ccEmbeddingLookupForward(ccHandle_t handle, const void *alpha, const
 
 /**
  * @ingroup
- * @brief embedding lookup forward. for nn.embedding_lookup
+ * @brief embedding lookup forward.
  * @param [in] handle cce handle
  * @param [in] alpha common scale factor
 * @param [in] inputNum inputNum
diff --git a/third_party/fwkacllib/inc/cce/dnn_struct.hpp b/third_party/fwkacllib/inc/cce/dnn_struct.hpp
old mode 100755
new mode 100644
diff --git a/third_party/fwkacllib/inc/cce/dnn_struct_base.hpp b/third_party/fwkacllib/inc/cce/dnn_struct_base.hpp
old mode 100755
new mode 100644
diff --git a/third_party/fwkacllib/inc/cce/l2fusion_struct.hpp b/third_party/fwkacllib/inc/cce/l2fusion_struct.hpp
old mode 100755
new mode 100644
diff --git a/third_party/fwkacllib/inc/cce/taskdown_api.h b/third_party/fwkacllib/inc/cce/taskdown_api.h
index db06350c..2323aaa7 100644
--- a/third_party/fwkacllib/inc/cce/taskdown_api.h
+++ b/third_party/fwkacllib/inc/cce/taskdown_api.h
@@ -32,9 +32,17 @@ typedef struct tagOpAddrsInfo {
   uintptr_t addrData;
 } ccOpAddrsInfo;
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 ccStatus_t ccUpdateKernelArgs(ccOpContext &opContext, uint64_t dataBaseAddr, uint64_t weightBaseAddr,
                               uint64_t variableBaseAddr, void *argsAddr, uint64_t argsSize, void *l2ctrlAddr);
 
+#ifdef __cplusplus
+}
+#endif
+
 ccStatus_t ccGetKernelArgsAddrs(ccOpContext &opContext, void *argsAddr, uint64_t argsSize, void *l2ctrlAddr,
                                 std::vector<ccOpAddrsInfo> &opAddrsInfo);
 
diff --git a/third_party/fwkacllib/inc/cce/taskdown_common.hpp b/third_party/fwkacllib/inc/cce/taskdown_common.hpp
old mode 100755
new mode 100644
diff --git a/third_party/fwkacllib/inc/hccl/base.h b/third_party/fwkacllib/inc/hccl/base.h
index 4ca597bb..d85d7bc4 100644
--- a/third_party/fwkacllib/inc/hccl/base.h
+++ b/third_party/fwkacllib/inc/hccl/base.h
@@ -63,6 +63,9 @@ typedef enum tagHcclResult {
   HCCL_E_RESERVED /**< reserved */
 } hcclResult_t;
 
+/* handle to communicator */
+typedef void *hcclComm_t;
+
 /**
  * @brief HCCL Reduction opperation
  */
diff --git a/third_party/fwkacllib/inc/mmpa/mmpa_api.h b/third_party/fwkacllib/inc/mmpa/mmpa_api.h
index f1e30538..ce1c9720 100644
--- a/third_party/fwkacllib/inc/mmpa/mmpa_api.h
+++ b/third_party/fwkacllib/inc/mmpa/mmpa_api.h
@@ -20,7 +20,7 @@
 #define LINUX 0
 #define WIN 1
 
-#if(OS_TYPE == LINUX)
+#if(OS_TYPE == LINUX) //lint !e553
 
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
@@ -84,7 +84,7 @@
 
 #endif
 
-#if(OS_TYPE == WIN)
+#if(OS_TYPE == WIN) //lint !e553
 #include <winsock2.h>
 #include <winsock.h>
 #include "Windows.h"
diff --git a/third_party/fwkacllib/inc/ops/aipp.h b/third_party/fwkacllib/inc/ops/aipp.h
index d32f6fdf..da2a36ca 100644
--- a/third_party/fwkacllib/inc/ops/aipp.h
+++ b/third_party/fwkacllib/inc/ops/aipp.h
@@ -20,6 +20,19 @@
 #include "../graph/operator_reg.h"
 
 namespace ge {
+/**
+*@brief Performs AI pre-processing (AIPP) on images, including color space conversion (CSC), image normalization (by subtracting the mean value or multiplying a factor), image cropping (by specifying the crop start and cropping the image to the size required by the neural network), and much more.
+
+*@par Inputs:
+*@li images: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer.
+*@li params: Dynamic AIPP configuration parameters of type uint8.
+
+*@par Attributes:
+*aipp_config_path: A required string, specifying the path of the AIPP configuration file.
+
+*@par Outputs:
+*features: The AIPP-processed output tensor of type float16 or uint8.
+*/
 REG_OP(Aipp)
     .INPUT(images, TensorType{DT_UINT8})
     .OPTIONAL_INPUT(params, TensorType{DT_UINT8})
@@ -28,4 +41,21 @@ REG_OP(Aipp)
     .OP_END_FACTORY_REG(Aipp)
 }  // namespace ge
 
+/**
+*@brief Handles dynamic AIPP input. If aipp-mode is set to dynamic in the AIPP configuration file, the framework automatically appends this input node to the end of the graph.
+
+*@par Attributes:
+*index: An attribute specifying the serial number of the dynamic AIPP input.
+
+*@par Outputs:
+*features: The AIPP-processed output tensor of all types.
+*/
+namespace ge {
+REG_OP(AippData)
+    .INPUT(data, TensorType::ALL())
+    .OUTPUT(out, TensorType::ALL())
+    .ATTR(index, Int, 0)
+    .OP_END_FACTORY_REG(AippData)
+}
+
 #endif  // GE_OP_AIPP_H
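A hedged sketch of how the registrations above surface at graph-construction time; the setter names are assumed from the usual REG_OP-generated pattern (set_input_<name>/set_attr_<name>), and the config path is a placeholder:

    auto images = op::Data("images").set_attr_index(0);
    auto aipp = op::Aipp("aipp")
                    .set_input_images(images)
                    .set_attr_aipp_config_path("./aipp.cfg");  // placeholder path
    // With aipp-mode set to dynamic in the config file, GE inserts the
    // AippData input node itself; it is not normally built by hand.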
diff --git a/third_party/fwkacllib/inc/ops/all_ops.h b/third_party/fwkacllib/inc/ops/all_ops.h
index f572f298..36c991ff 100644
--- a/third_party/fwkacllib/inc/ops/all_ops.h
+++ b/third_party/fwkacllib/inc/ops/all_ops.h
@@ -18,20 +18,30 @@
 #define BUILT_IN_OP_PROTO_INC_ALL_OPS_H_
 
 #include "aipp.h"
-#include "aipp_data.h"
 #include "array_ops.h"
 #include "audio_ops.h"
 #include "batch_ops.h"
 #include "bitwise_ops.h"
 #include "boosted_trees_ops.h"
 #include "candidate_sampling_ops.h"
+#include "clip_boxes.h"
 #include "control_flow_ops.h"
+#include "ctc_ops.h"
 #include "data_flow_ops.h"
+#include "decode_bbox.h"
+#include "decode_boundaries_target.h"
+#include "decode_cornerpoints_target_bg.h"
+#include "decode_cornerpoints_target_wrt_center_v1.h"
+#include "decode_wheels_target.h"
 #include "elewise_calculation_ops.h"
-#include "hcom_ops.h"
+#include "fastrcnn_predictions.h"
+#include "fsrdetectionoutput_ops.h"
+#include "functional_ops.h"
 #include "get_data_ops.h"
+#include "hcom_ops.h"
 #include "image_ops.h"
 #include "linalg_ops.h"
+#include "logging_ops.h"
 #include "lookup_ops.h"
 #include "math_ops.h"
 #include "matrix_calculation_ops.h"
@@ -43,23 +53,31 @@
 #include "nn_pooling_ops.h"
 #include "nn_training_ops.h"
 #include "nonlinear_fuc_ops.h"
+#include "no_op.h"
 #include "npu_loss_scale_ops.h"
+#include "outfeed_ops.h"
 #include "pad_ops.h"
+#include "parsing_ops.h"
+#include "power_ops.h"
 #include "quantize_ops.h"
+#include "ragged_conversion_ops.h"
 #include "random_ops.h"
 #include "reduce_ops.h"
+#include "resource_variable_ops.h"
+#include "rnn.h"
+#include "roipooling_ops.h"
 #include "rpn_ops.h"
+#include "rpn_proposals.h"
 #include "save_ops.h"
 #include "selection_ops.h"
 #include "set_ops.h"
 #include "sparse_ops.h"
 #include "split_combination_ops.h"
+#include "ssddetectionoutput_ops.h"
+#include "stateful_random_ops.h"
+#include "stateless_random_ops.h"
 #include "state_ops.h"
-#include "transformation_ops.h"
-#include "logging_ops.h"
 #include "string_ops.h"
-#include "outfeed_ops.h"
-#include "stateless_random_ops.h"
-#include "dvpp_ops.h"
-#include "rnn.h"
+#include "swap_co_ops.h"
+#include "transformation_ops.h"
 
 #endif  // BUILT_IN_OP_PROTO_INC_ALL_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/array_ops.h b/third_party/fwkacllib/inc/ops/array_ops.h
index 9354a5e5..74f8924a 100644
--- a/third_party/fwkacllib/inc/ops/array_ops.h
+++ b/third_party/fwkacllib/inc/ops/array_ops.h
@@ -31,7 +31,8 @@ namespace ge {
 * @li values:A `Tensor`. Must have the same type as `sorted_x`.
 
 *@par Attributes:
-*@li out_type:An optional `DType` from: `int32, int64`. Defaults to `int32`.
+*@li out_type:An optional `DType` from: `int32, int64`. \n
+Defaults to `int32`.
 
 *@par Outputs:
 *y: A `Tensor` of type `out_type`.
@@ -60,132 +61,115 @@ REG_OP(LowerBound)
 *@brief Reverses variable length slices.
 
 *@par Inputs:
-*The input x can be k-dimensional tensor, num_lower and num_upper can be zero-dimensional scalar. Inputs include: \n
-* @li x:A Tensor. The input to reverse.
-* @li seq_lengths:A Tensor. Must be one of the following types: int32, int64. 1-D.
+*Input "x" is a k-dimensional tensor. Inputs "num_lower" and "num_upper" \n
+are 0D scalars.
+* @li x: A Tensor. The input to reverse.
+* @li seq_lengths: A 1D Tensor of type int32 or int64.
 
 *@par Attributes:
-*@li seq_dim:An optional int. Defaults to 0. The dimension along which reversal is performed.
-*@li batch_dim:An optional int. Defaults to 0. The dimension along which reversal is performed.
+*@li seq_dim: An optional int. Defaults to "0". The dimension along which \n
+reversal is performed.
+*@li batch_dim: An optional int. Defaults to "0". The dimension along which \n
+reversal is performed.
 
 *@par Outputs:
-*y: Rank k tensor of the same shape as input. The extracted banded tensor.
+*y: A rank k tensor. Has the same shape as input. The extracted banded tensor.
 
 *@attention Constraints: \n
-*-The implementation for ReverseSequence on Ascend uses AI CPU, with bad performance.
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*ReverseSequence runs on the Ascend AI CPU, which delivers poor performance.
 */
 REG_OP(ReverseSequence)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
-        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE}))
+        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
     .INPUT(seq_lengths, TensorType({DT_INT32, DT_INT64}))
     .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
-        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE}))
+        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
     .REQUIRED_ATTR(seq_dim, Int)
    .ATTR(batch_dim, Int, 0)
    .OP_END_FACTORY_REG(ReverseSequence)
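A hedged construction sketch for the widened ReverseSequence registration above (operands are placeholders; per the registration, seq_dim is the only required attribute):

    auto rs = op::ReverseSequence("reverse_seq")
                  .set_input_x(x)
                  .set_input_seq_lengths(seq_lengths)  // 1D int32/int64
                  .set_attr_seq_dim(1)
                  .set_attr_batch_dim(0);              // optional, defaults to 0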
 \n
 */
 REG_OP(MatrixBandPart)
     .INPUT(x, TensorType({ DT_INT8, DT_UINT8, \
         DT_INT16, DT_UINT16, DT_INT32, DT_INT64,
-        DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL }))
+        DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL,
+        DT_COMPLEX64, DT_COMPLEX128 }))
     .INPUT(num_lower, TensorType({ DT_INT32, DT_INT64 }))
     .INPUT(num_upper, TensorType({ DT_INT32, DT_INT64 }))
     .OUTPUT(y, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL }))
+        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL,
+        DT_COMPLEX64, DT_COMPLEX128}))
     .OP_END_FACTORY_REG(MatrixBandPart)

/**
-*@brief Finds unique elements in a 1-D tensor.
+*@brief Finds unique elements in a 1D tensor.

 *@par Inputs:
-*The input x can be k-dimensional tensor, num_lower and num_upper can be zero-dimensional scalar. Inputs include: \n
-*x:1-D tensor.
+*x: A 1D tensor.

 *@par Attributes:
-*out_idx:An optional DType from: int32, int64. Defaults to int32. \n
+*out_idx: An optional DType from: "int32, int64". \n
+Defaults to "int32".

 *@par Outputs:
-*@li y:A Tensor. Has the same type as x.
-*@li idx:A Tensor of type out_idx.
-*@li count:A Tensor of type out_idx.
+*@li y: A Tensor. The unique elements of "x". Has the same type as "x".
+*@li idx: A Tensor of type "out_idx".
+*@li count: A Tensor of type "out_idx".

 *@attention Constraints: \n
-*-The implementation for UniqueWithCounts on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*UniqueWithCounts runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(UniqueWithCounts)
     .INPUT(x, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
+        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_STRING }))
     .OUTPUT(y, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
+        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_STRING }))
     .OUTPUT(idx, TensorType({ DT_INT32, DT_INT64 }))
     .OUTPUT(count, TensorType({ DT_INT32, DT_INT64 }))
     .REQUIRED_ATTR(out_idx, Type)
     .OP_END_FACTORY_REG(UniqueWithCounts)

/**
-*@brief Finds unique elements in a 1-D tensor.
+*@brief Finds unique elements in a 1D tensor.

 *@par Inputs:
-*The input x can be k-dimensional tensor, num_lower and num_upper can be zero-dimensional scalar. Inputs include: \n
-*x:1-D tensor.
+*x: A 1D tensor.

 *@par Attributes:
-*out_idx:An optional DType from: int32, int64. Defaults to int32.
+*out_idx: An optional DType from: "int32, int64". Defaults to "int32".

 *@par Outputs:
-*@li y:x in the unique output y.
-*@li idx:A tensor idx the same size as x that contains the index of each value of x.
+*@li y: A Tensor. The unique elements of "x".
+*@li idx: A tensor the same size as "x". The index of each value of "x" \n
+in the unique output "y".

 *@attention Constraints: \n
-*-The implementation for Unique on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*Unique runs on the Ascend AI CPU, which delivers poor performance.
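+
+*@par Example: \n
+*For x = [1, 1, 2, 4, 4, 4, 7, 8, 8] (illustrative values): \n
+*y = [1, 2, 4, 7, 8] and idx = [0, 0, 1, 2, 2, 2, 3, 4, 4].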
 \n
 */
 REG_OP(Unique)
@@ -198,29 +182,25 @@ REG_OP(Unique)
     .OP_END_FACTORY_REG(Unique)

/**
-*@brief Finds unique elements in a 1-D tensor.
+*@brief Finds unique elements in a 1D tensor.

 *@par Inputs:
-*The input x can be k-dimensional tensor, num_lower and num_upper can be zero-dimensional scalar. Inputs include: \n
-* @li x:1-D tensor.
-* @li axis:A `Tensor` of type `int32` (default: None). The axis of the Tensor to.
+*Inputs "x" and "axis" are 1D tensors. \n
+*Including:
+* @li x: A 1D tensor.
+* @li axis: A Tensor of type int32. Defaults to "None". The axis of "x" \n
+to find unique elements along.

 *@par Attributes:
-*out_idx:An optional DType from: int32, int64. Defaults to int32.
+*out_idx: An optional DType from: "int32, int64". \n
+Defaults to "int32".

 *@par Outputs:
-*@li y:x in the unique output y.
-*@li idx:A tensor idx the same size as x that contains the index of each value of x.
+*@li y: A Tensor. The unique elements of "x".
+*@li idx: A tensor the same size as "x". The index of each value of "x" \n
+in the unique output "y".

 *@attention Constraints: \n
-*-The implementation for UniqueExt2 on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*UniqueExt2 runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(UniqueExt2)
@@ -237,21 +217,13 @@ REG_OP(UniqueExt2)
 *@brief Computes the inverse permutation of a tensor.

 *@par Inputs:
-*The input x can be k-dimensional tensor. Inputs include: \n
-*x:K-D tensor.
+*x: A 1D integer tensor representing a permutation. \n

 *@par Outputs:
-*y:1-D tensor.
+*y: A 1D tensor.

-*@attention Constraints:\n
-*-The implementation for InvertPermutation on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*@attention Constraints: \n
+*InvertPermutation runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(InvertPermutation)
@@ -263,24 +235,16 @@ REG_OP(InvertPermutation)
 *@brief Checks a tensor for NaN and Inf values.

 *@par Inputs:
-*The input x can be k-dimensional tensor. Inputs include: \n
-*x:The input tensor.
+*x: A k-dimensional tensor. \n

 *@par Attributes:
-*message:Prefix of the error message.
+*message: Prefix of the error message.

 *@par Outputs:
-*y:The output tensor.
+*y: The output tensor.

 *@attention Constraints: \n
-*-The implementation for CheckNumerics on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*CheckNumerics runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(CheckNumerics)
@@ -293,22 +257,17 @@ REG_OP(CheckNumerics)
 *@brief Converts an array of flat indices into a tuple of coordinate arrays.

 *@par Inputs:
-*The input indices can be 0-D or 1-D tensor, dims can be 1-D. Inputs include: \n
-* @li indices: A 0-D or 1-D int Tensor whose elements are indices into the flattened version of an array of dimensions dims.
-* @li dims:A Tensor. Must have the same type as indices. An 1-D int Tensor. The shape of the array to use for unraveling indices.
+*Input "indices" is a 0D or 1D tensor. Input "dims" is a 1D tensor.
 \n
+* @li indices: A 0D or 1D int Tensor whose elements are indices into \n
+the flattened version of an array of dimensions "dims".
+* @li dims: A 1D int Tensor of the same type as "indices". \n
+*The shape of the array to use for unraveling indices.

 *@par Outputs:
-*y:A Tensor. Has the same type as indices.
+*y: A Tensor. Has the same type as "indices".

 *@attention Constraints: \n
-*-The implementation for UnravelIndex on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*UnravelIndex runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(UnravelIndex)
@@ -321,25 +280,18 @@ REG_OP(UnravelIndex)
 *@brief Applies upper_bound(sorted_search_values, values) along each row.

 *@par Inputs:
-*The input sorted_x can be 2-D tensor, values can be 2-D. Inputs include:
-* @li sorted_x: 2-D Tensor where each row is ordered.
-* @li values:2-D Tensor with the same numbers of rows as `sorted_x.
+*Inputs "sorted_x" and "values" are 2D tensors.
+* @li sorted_x: A 2D Tensor where each row is ordered.
+* @li values: A 2D Tensor with the same number of rows as "sorted_x".

 *@par Attributes:
-*out_type:sets the optional out_type attribute to value.
+*out_type: The data type of output "y". An optional DType from: \n
+"int32, int64". Defaults to "int32".

 *@par Outputs:
-*y:A `Tensor` with the same shape as `values`.
+*y: A Tensor with the same shape as "values".

 *@attention Constraints: \n
-*-The implementation for UpperBound on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*UpperBound runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(UpperBound)
@@ -352,101 +304,86 @@ REG_OP(UpperBound)
     .OP_END_FACTORY_REG(UpperBound)

/**
-*@brief Finds unique elements in a 1-D tensor.
+*@brief Finds unique elements in a 1D tensor.

 *@par Inputs:
-*The input x can be 1-D vector, axis can be 1-D vector. Inputs include: \n
-* @li x:1-D tensor.
-* @li axis:1-D tensor.
+*Inputs "x" and "axis" are 1D vectors. \n
+* @li x: A 1D tensor.
+* @li axis: A 1D tensor.

 *@par Attributes:
-*out_idx:An optional DType from: int32, int64. Defaults to int32.
+*out_idx: An optional DType from: "int32, int64". \n
+Defaults to "int32".

 *@par Outputs:
-*@li y:x in the unique output y.
-*@li idx:A tensor idx the same size as x that contains the index of each value of x.
-*@li count:A tensor idx the same size as x that contains the index of each value of x.
+*@li y: A Tensor. The unique elements of "x".
+*@li idx: A tensor the same size as "x". The index of each value of "x" \n
+in the unique output "y".
+*@li count: A tensor the same size as "y". The count of each unique value \n
+of "x".

 *@attention Constraints: \n
-*-The implementation for UniqueWithCountsExt2 on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*UniqueWithCountsExt2 runs on the Ascend AI CPU, which delivers poor performance.
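+
+*@par Example: \n
+*For x = [1, 1, 2, 4, 4, 4, 7, 8, 8] (illustrative values, default axis): \n
+*y = [1, 2, 4, 7, 8], idx = [0, 0, 1, 2, 2, 2, 3, 4, 4], \n
+*and count = [2, 1, 3, 1, 2].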
 \n
 */
 REG_OP(UniqueWithCountsExt2)
     .INPUT(x, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
+        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_STRING }))
     .INPUT(axis, TensorType({ DT_INT32, DT_INT64 }))
     .OUTPUT(y, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
+        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_STRING }))
     .OUTPUT(idx, TensorType({ DT_INT32, DT_INT64 }))
     .OUTPUT(count, TensorType({ DT_INT32, DT_INT64 }))
     .REQUIRED_ATTR(out_idx, Type)
     .OP_END_FACTORY_REG(UniqueWithCountsExt2)

/**
-*@brief Fill the tensor with the mirror value.
+*@brief Fills the tensor with the mirror value.

 *@par Inputs:
-*The input x and paddings can be one-dimensional scalar. Inputs include: \n
-* @li x: input tensor to be padded.
-* @li paddings: A two-column matrix specifying the padding sizes. The number of rows must be the same as the rank of `input`.
+*Input "paddings" is a two-column matrix. \n
+* @li x: The tensor to be padded.
+* @li paddings: A two-column matrix specifying the padding sizes. \n
+The number of rows must equal the rank of "x".

 *@par Attributes:
-*mode:Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions do not include the borders, while in symmetric mode the padded regions do include the borders.
+*mode: Either "REFLECT" or "SYMMETRIC". In reflect mode the padded regions \n
+do not include the borders, while in symmetric mode the padded regions \n
+do include the borders.

 *@par Outputs:
 *y: The padded tensor.

 *@attention Constraints: \n
--The implementation for MirrorPad on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*MirrorPad runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(MirrorPad)
     .INPUT(x, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL }))
+        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL, \
+        DT_COMPLEX64, DT_COMPLEX128 }))
     .INPUT(paddings, TensorType({ DT_INT32, DT_INT64 }))
     .OUTPUT(y, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL }))
+        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL, \
+        DT_COMPLEX64, DT_COMPLEX128 }))
     .REQUIRED_ATTR(mode, String)
     .OP_END_FACTORY_REG(MirrorPad)

/**
-*@brief Calculate the difference between two numbers or a list of strings.
+*@brief Computes the difference between two lists of numbers or strings.

 *@par Inputs:
-*The input x and y can be one-dimensional vector. Inputs include: \n
-* @li x:A Tensor. 1-D. Values to keep.
-* @li y:A Tensor. Must have the same type as x. 1-D. Values to remove.
+*Inputs "x" and "y" are 1D vectors. \n
+* @li x: A Tensor. 1D. Values to keep.
+* @li y: A Tensor. Must have the same type as "x". 1D. Values to remove.

 *@par Attributes:
-*out_idx:An optional DType from: int32, int64. Defaults to int32.
+*out_idx: An optional DType from: "int32, int64". Defaults to "int32".

 *@par Outputs:
-*@li out:A Tensor. Has the same type as x.
-*@li idx:A Tensor of type out_idx.
-
-*@attention Constraints:\n
--The implementation for ListDiff on Ascend uses AI CPU, with bad performance. \n
+*@li out: A Tensor. Has the same type as "x".
+*@li idx: A Tensor of type "out_idx".
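+
+*@par Example: \n
+*For x = [1, 2, 3, 4, 5, 6] and y = [1, 3, 5] (illustrative values): \n
+*out = [2, 4, 6] and idx = [1, 3, 5].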
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*@attention Constraints: \n
+*ListDiff runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(ListDiff)
@@ -465,9 +402,7 @@ REG_OP(ListDiff)
 Operator Const has the same definition as operator Constant.

 *@par Attributes:
-*@li value: Required. The value and type of the resulting tensor.
-*@li dtype: Optional. The type of the elements of the resulting tensor. \n
-The data type specified by this parameter must be the same as that of the "value" attribute.
+*value: Required. The value and type of the resulting tensor. \n
+The type is unrestricted.

 *@par Outputs:
 *y: A constant tensor.
@@ -475,17 +410,14 @@ The data type specified by this parameter must be the same as that of the "value
 REG_OP(Const)
     .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
         DT_UINT8, DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}))
-    .ATTR(value, Tensor, Tensor())  // This is the value of the const op
-    .ATTR(dtype, Int, 0)
+    .ATTR(value, Tensor, Tensor())
     .OP_END_FACTORY_REG(Const)

/**
*@brief Creates a constant tensor for training.

 *@par Attributes:
-*@li value: Required. The value and type of the resulting tensor.
-*@li dtype: Optional. The type of the elements of the resulting tensor. \n
-The data type specified by this parameter must be the same as that of the "value" attribute.
+*value: Required. The value and type of the resulting tensor. \n
+The type is unrestricted.

 *@par Outputs:
 *y: The constant tensor.
@@ -493,8 +425,7 @@ The data type specified by this parameter must be the same as that of the "value
 REG_OP(Constant)
     .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
         DT_UINT8, DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}))
-    .ATTR(value, Tensor, Tensor())  // This is the value of the constant op
-    .ATTR(dtype, Int, 0)
+    .ATTR(value, Tensor, Tensor())
     .OP_END_FACTORY_REG(Constant)

@@ -649,8 +580,6 @@ REG_OP(ExpandDims)
     .INPUT(axis, TensorType({DT_INT32, DT_INT64}))
     .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8,
                            DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}))
-    .ATTR(T, Int, 0)
-    .ATTR(Tdim, Int, 0)
     .OP_END_FACTORY_REG(ExpandDims)

@@ -692,8 +621,6 @@ REG_OP(Reshape)
 REG_OP(Squeeze)
     .INPUT(x, TensorType::ALL())
     .OUTPUT(y, TensorType::ALL())
-    .ATTR(T, Int, 0)
-    .ATTR(squeeze_dims, ListInt, {})
     .ATTR(axis, ListInt, {})
     .OP_END_FACTORY_REG(Squeeze)

@@ -728,14 +655,25 @@ REG_OP(Size)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8,
                           DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}))
     .OUTPUT(y, TensorType({DT_INT32,DT_INT64}))
-    .ATTR(alpha, Float, 1.0)
-    .ATTR(beta, Float, 0.0)
-    .ATTR(out_type, Int, DT_INT32)
+    .ATTR(dtype, Int, DT_INT32)
     .OP_END_FACTORY_REG(Size)

+/**
+*@brief Provides input data for other operators.
+
+*@par Inputs:
+*x: A tensor.
+
+*@par Attributes:
+*index: An attribute of type int, specifying the index of this input \n
+in the graph.
+
+*@par Outputs:
+*y: A tensor.
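+
+*@par Example: \n
+*A minimal graph-construction sketch. It assumes the "set_attr_index" \n
+setter that REG_OP generates for the "index" attribute and the ge::op \n
+namespace; this is an illustration, not part of this patch: \n
+*    auto input0 = ge::op::Data("input0").set_attr_index(0);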
+
+*/
 REG_OP(Data)
-    .INPUT(data, TensorType::ALL())
-    .OUTPUT(out, TensorType::ALL())
+    .INPUT(x, TensorType::ALL())
+    .OUTPUT(y, TensorType::ALL())
     .ATTR(index, Int, 0)
     .OP_END_FACTORY_REG(Data)

@@ -790,9 +728,7 @@ REG_OP(Shape)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8,
                           DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}))
     .OUTPUT(y, TensorType({DT_INT32, DT_INT64}))
-    .ATTR(alpha, Float, 1.0)
-    .ATTR(beta, Float, 0.0)
-    .ATTR(out_type, Int, DT_INT32)
+    .ATTR(dtype, Int, DT_INT32)
     .OP_END_FACTORY_REG(Shape)

@@ -811,9 +747,7 @@ REG_OP(ShapeN)
     .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8,
                                   DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}))
     .DYNAMIC_OUTPUT(y, TensorType({DT_INT32, DT_INT64}))
-    .ATTR(alpha, Float, 1.0)
-    .ATTR(beta, Float, 0.0)
-    .ATTR(out_type, Int, DT_INT32)
+    .ATTR(dtype, Int, DT_INT32)
     .OP_END_FACTORY_REG(ShapeN)

@@ -838,45 +772,94 @@ REG_OP(Empty)
     .OP_END_FACTORY_REG(Empty)

/**
-*@brief Gradient op for MirrorPad op. This op folds a mirror-padded tensor.
+*@brief Gradient op for MirrorPad op. Folds a mirror-padded tensor.

 *@par Inputs:
-*The input x and y can be one-dimensional vector. Inputs include: \n
-* @li x:A Tensor. The input tensor to be folded.
-* @li paddings:A Tensor. Must be one of the following types: int32, int64. A two-column matrix specifying the padding sizes.
+*Input "paddings" is a two-column matrix. \n
+* @li x: A Tensor. The input tensor to be folded.
+* @li paddings: A Tensor of type int32 or int64. A two-column matrix \n
+specifying the padding sizes.

 *@par Attributes:
-*mode:A string from: "REFLECT", "SYMMETRIC". The mode used in the MirrorPad op.
+*mode: A string from: "REFLECT", "SYMMETRIC". The mode used in the MirrorPad op.

 *@par Outputs:
-*y:A Tensor. Has the same type as x.
+*y: A Tensor. Has the same type as "x".

 *@attention Constraints: \n
--The implementation for MirrorPadGrad on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*MirrorPadGrad runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(MirrorPadGrad)
     .INPUT(x, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
+        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \
+        DT_COMPLEX64, DT_COMPLEX128 }))
     .INPUT(paddings, TensorType({DT_INT32, DT_INT64}))
     .OUTPUT(y, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
+        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \
+        DT_COMPLEX64, DT_COMPLEX128 }))
     .REQUIRED_ATTR(mode, String)
     .OP_END_FACTORY_REG(MirrorPadGrad)

+/**
+*@brief Returns locations of nonzero / true values in a tensor.
+
+*@par Inputs:
+*Including: \n
+*x: A Tensor. Must be one of the following types: \n
+DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \n
+DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL.
+
+*@par Outputs:
+*y: A Tensor of type DT_INT64.
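+
+*@par Example: \n
+*For a bool input x = [[true, false], [false, true]] (illustrative values), \n
+*y = [[0, 0], [1, 1]]: one row of int64 coordinates per true element.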
+
+*@attention Constraints: \n
+*Where runs on the Ascend AI CPU, which delivers poor performance. \n
+
+*/
+
 REG_OP(Where)
     .INPUT(x, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \
         DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL}))
     .OUTPUT(y, TensorType({DT_INT64}))
     .OP_END_FACTORY_REG(Where)

+/**
+*@brief Copies a blob into multiple output blobs, for feeding a blob into \n
+multiple output layers. \n
+*The Split node is removed from the graph after the split operation is completed.
+
+*@par Inputs:
+*x: A Tensor. Must be one of the following types: float16, float32, int8, int32.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as "x".
+*/
+REG_OP(Copy)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT32}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT32}))
+    .OP_END_FACTORY_REG(Copy)

+/**
+*@brief Generates fingerprint values.
+
+*@par Inputs:
+*@li data: Must have rank 1 or higher.
+*@li method: Fingerprint method used by this op. Currently available method is \n
+`farmhash::fingerprint64`.
+
+*@par Outputs:
+*y: A two-dimensional Tensor of type DT_UINT8. The first dimension equals \n
+"data"'s first dimension, and the second dimension size depends on the \n
+fingerprint algorithm.
+*/
+
+REG_OP(Fingerprint)
+    .INPUT(data, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \
        DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL}))
+    .INPUT(method, TensorType({DT_STRING}))
+    .OUTPUT(y, TensorType({DT_UINT8}))
+    .OP_END_FACTORY_REG(Fingerprint)
} // namespace ge

#endif  // GE_OP_ARRAY_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/audio_ops.h b/third_party/fwkacllib/inc/ops/audio_ops.h
index 41c25492..6dff7712 100644
--- a/third_party/fwkacllib/inc/ops/audio_ops.h
+++ b/third_party/fwkacllib/inc/ops/audio_ops.h
@@ -22,31 +22,26 @@ namespace ge {

/**
-*@brief Mel-Frequency Cepstral Coefficient (MFCC) calculation consists of taking the DCT-II of a log-magnitude mel-scale spectrogram.
+*@brief Mel-Frequency Cepstral Coefficient (MFCC) calculation consists of \n
+taking the DCT-II of a log-magnitude mel-scale spectrogram.

-*@par Inputs:
-*The input spectrogram must be three-dimensional tensor, sample_rate must be a scalar. Inputs include: \n
-* @li spectrogram:3D float tensor of mel-frequency cepstral coefficient.
-* @li sample_rate:Mel-Frequency Cepstral Coefficient (MFCC) calculation sample rate.
+*@par Inputs:
+*Input "spectrogram" is a 3D tensor. Input "sample_rate" is a scalar. \n
+* @li spectrogram: A 3D float tensor.
+* @li sample_rate: The MFCC sample rate.

-*@par Attributes:
-*@li upper_frequency_limit:Upper limit of the mfcc calculation frequency.
-*@li lower_frequency_limit:Lower limit of the mfcc calculation frequency.
-*@li filterbank_channel_count:Count of the channel filterbank.
-*@li dct_coefficient_count:Count of the dct coefficient.
+*@par Attributes:
+*@li upper_frequency_limit: The highest frequency for calculation.
+*@li lower_frequency_limit: The lowest frequency for calculation.
+*@li filterbank_channel_count: Resolution of the Mel bank.
+*@li dct_coefficient_count: Number of output channels to produce \n
+per time slice.

-*@par Outputs:
-*y:A float32 Tensor of the MFCCs of spectrogram.
+*@par Outputs:
+*y: A Tensor of type float32.
-*@attention Constraints:\n
-*-The implementation for Mfcc on Ascend uses AI CPU, with bad performance.\n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*@attention Constraints: \n
+*Mfcc runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(Mfcc)
@@ -60,29 +55,23 @@ REG_OP(Mfcc)
     .OP_END_FACTORY_REG(Mfcc)

/**
-*@brief Decode and generate spectrogram using wav float tensor.
+*@brief Generates a spectrogram from a WAV float tensor.

-*@par Inputs:
-*The input x must be two-dimensional matrices. Inputs include: \n
-* x:float tensor of the wav audio contents. contains length and channel
+*@par Inputs:
+*Input "x" is a 2D matrix. \n
+* x: A float tensor. Float representation of audio data.

-*@par Attributes:
-*@li window_size:Size of the spectrogram window.
-*@li stride:Size of the spectrogram stride.
-*@li magnitude_squared:If true, using magnitude squared.
+*@par Attributes:
+*@li window_size: Size of the spectrogram window.
+*@li stride: Size of the spectrogram stride.
+*@li magnitude_squared: If true, uses squared magnitude.

-*@par Outputs:
-*spectrogram:3-D float Tensor with the image contents.
+*@par Outputs:
+*spectrogram: A 3D float Tensor.

-*@attention Constraints:\n
-*-The implementation for AudioSpectrogram on Ascend uses AI CPU, with bad performance.\n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*@attention Constraints: \n
+*AudioSpectrogram runs on the Ascend AI CPU, which delivers \n
+poor performance.
 */
 REG_OP(AudioSpectrogram)
@@ -94,29 +83,23 @@ REG_OP(AudioSpectrogram)
     .OP_END_FACTORY_REG(AudioSpectrogram)

/**
-*@brief Decode a 16-bit WAV file into a float tensor.
+*@brief Decodes a 16-bit WAV file into a float tensor.

-*@par Inputs:
-*The input contents must be string tensor. Inputs include: \n
-* @li contents:A Tensor of type string. The WAV-encoded audio, usually from a file.
+*@par Inputs:
+*contents: A Tensor of type string. The WAV-encoded audio, usually from a file.

-*@par Attributes:
-*@li desired_channels:An optional int. Defaults to -1. Number of sample channels wanted.
-*@li desired_samples:An optional int. Defaults to -1. Length of audio requested.
+*@par Attributes:
+*@li desired_channels: An optional int. Defaults to "-1". \n
+Number of sample channels wanted.
+*@li desired_samples: An optional int. Defaults to "-1". \n
+Length of audio requested.

-*@par Outputs:
-*@li *audio:A Tensor of type float32.
-*@li *sample_rate:A Tensor of type int32.
+*@par Outputs:
+*@li audio: A Tensor of type float32.
+*@li sample_rate: A Tensor of type int32.

 *@attention Constraints: \n
-*-The implementation for DecodeWav on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*DecodeWav runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(DecodeWav)
@@ -127,6 +110,22 @@ REG_OP(DecodeWav)
     .ATTR(desired_samples, Int, -1)
     .OP_END_FACTORY_REG(DecodeWav)

+/**
+*@brief Encodes audio data using the WAV file format.
+
+*@par Inputs:
+*Including: \n
+* @li audio: A Tensor of type DT_FLOAT.
+* @li sample_rate: A Tensor of type DT_INT32.
+ +*@par Outputs: +*contents: A Tensor of type DT_STRING. + +*@attention Constraints:\n +*EncodeWav runs on the Ascend AI CPU, which delivers poor performance.\n + +*/ + REG_OP(EncodeWav) .INPUT(audio, TensorType({DT_FLOAT})) .INPUT(sample_rate, TensorType({DT_INT32})) diff --git a/third_party/fwkacllib/inc/ops/batch_ops.h b/third_party/fwkacllib/inc/ops/batch_ops.h index d9151589..c4d995c4 100644 --- a/third_party/fwkacllib/inc/ops/batch_ops.h +++ b/third_party/fwkacllib/inc/ops/batch_ops.h @@ -22,37 +22,35 @@ namespace ge { /** -*@brief Creates batches of tensors in tensors. - -*@par Inputs: -*The input x_tensors can be a list or a dictionary of tensors. Inputs include: \n -*x_tensors:The list or dictionary of tensors to enqueue. - -*@par Attributes: -*@li num_batch_threads:The number of threads enqueuing tensors. The batching will be nondeterministic if num_batch_threads > 1. -*@li max_batch_size:Max batch size pulled from the queue. -*@li max_enqueued_batches:Maximum number of batches pulled from the queue. -*@li batch_timeout_micros:Batch processing timeout in microseconds unit. -*@li allowed_batch_sizes:Allowed batch size pulled from the queue. -*@li grad_timeout_micros:Calculate the gradient batch processing timeout in microseconds unit. -*@li container:If non-empty, this queue is placed in the given container. Otherwise, a default container is used. -*@li shared_name:If set, this queue will be shared under the given name across multiple sessions. -*@li batching_queue:queue resource container. - -*@par Outputs: -*@li y_index:Tensor, index of a BatchTensor. Must be in row-major order. -*@li y_id:Tensor, id of a BatchTensor. Must be in row-major order. -*@li y_tensors:A list or dictionary of tensors with the same types as tensors. +*@brief Creates batches of tensors in "x_tensors". + +*@par Inputs: +*Input "x_tensors" is a list or a dictionary of tensors. \n +*x_tensors: The list or dictionary of tensors to enqueue. + +*@par Attributes: +*@li num_batch_threads: The number of threads enqueuing "x_tensors". \n +The batching will be nondeterministic if "num_batch_threads" > 1. +*@li max_batch_size: The maximum batch size pulled from the queue. +*@li max_enqueued_batches: The maximum number of batches pulled from the queue. +*@li batch_timeout_micros: The batch processing timeout, in microseconds. +*@li allowed_batch_sizes: The allowed batch size pulled from the queue. +*@li grad_timeout_micros: The gradient batch processing timeout, \n +in microseconds. +*@li container: If non-empty, this queue is placed in the given container. \n +Otherwise, a default container is used. +*@li shared_name: If set, this queue will be shared under the given name \n +across multiple sessions. +*@li batching_queue: The queue resource container. + +*@par Outputs: +*@li y_index: A Tensor. The index of a BatchTensor. Must be in row-major order. +*@li y_id: A Tensor. The ID of a BatchTensor. Must be in row-major order. +*@li y_tensors: A list or dictionary of tensors with \n +the same types as "x_tensors". *@attention Constraints: \n -*-The implementation for Batch on Ascend uses AI CPU, with bad performance. \n - -*@par Quantization supported or not -*Not supported -*@par Quantized inference supported or not -*Supported -*@par L2 convergence supported or not -*@par Multiple batches supported or not +*Batch runs on the Ascend AI CPU, which delivers poor performance. \n */ REG_OP(Batch) @@ -76,29 +74,24 @@ REG_OP(Batch) /** *@brief Reverses the operation of Batch for a single output Tensor. 
-*@par Inputs: -*The input x_tensors can be a list or a dictionary of tensors. Inputs include: \n -* @li x_tensors:The list or dictionary of tensors to enqueue. -* @li index:The matching batch_index obtained from Batch. -* @li id:The id scalar emitted by Batch. +*@par Inputs: +*Input "x_tensors" is a list or a dictionary of tensors. \n +* @li x_tensors: The list or dictionary of tensors to enqueue. +* @li index: The matching "batch_index" obtained from Batch. +* @li id: The "id" scalar emitted by Batch. -*@par Attributes: -*@li timeout_micros:Calculate the unbatch processing timeout in microseconds unit. -*@li container:If non-empty, this queue is placed in the given container. Otherwise, a default container is used. -*@li shared_name:If set, this queue will be shared under the given name across multiple sessions. +*@par Attributes: +*@li timeout_micros: The unbatch processing timeout, in microseconds. +*@li container: If non-empty, this queue is placed in the given container. \n +Otherwise, a default container is used. +*@li shared_name: If set, this queue will be shared under the given name \n +across multiple sessions. -*@par Outputs: -*y_tensor:A list or dictionary of tensors with the same types as tensors. +*@par Outputs: +*y_tensor: A list or dictionary of tensors with the same types as "x_tensors". *@attention Constraints: \n -*-The implementation for Unbatch on Ascend uses AI CPU, with bad performance. \n - -*@par Quantization supported or not -*Not supported -*@par Quantized inference supported or not -*Supported -*@par L2 convergence supported or not -*@par Multiple batches supported or not +*Unbatch runs on the Ascend AI CPU, which delivers poor performance. \n */ REG_OP(Unbatch) @@ -114,31 +107,27 @@ REG_OP(Unbatch) .OP_END_FACTORY_REG(Unbatch) /** -*@brief Acts like Batch but using the given batch_index index of batching things as they become available. +*@brief Acts like Batch but using the given "batch_index" index of batching \n +things as they become available. -*@par Inputs: -*The input x_input can be a list or a dictionary of tensors. Inputs include: \n -* @li x_input:The input to the Unbatch operation. -* @li index:The batch_index given to the Unbatch operation. -* @li id:The id scalar emitted by Batch. -* @li grad:The downstream gradient. +*@par Inputs: +*Input "x_input" is a list or a dictionary of tensors. \n +* @li x_input: The input to the Unbatch operation. +* @li index: The batch_index given to the Unbatch operation. +* @li id: The "id" scalar emitted by Batch. +* @li grad: The downstream gradient. -*@par Attributes: -*@li container:If non-empty, this queue is placed in the given container. Otherwise, a default container is used. -*@li shared_name:If set, this queue will be shared under the given name across multiple sessions. +*@par Attributes: +*@li container: If non-empty, this queue is placed in the given container. \n +Otherwise, a default container is used. +*@li shared_name: If set, this queue will be shared under the given name \n +across multiple sessions. -*@par Outputs: -*y_grad:The return value, either an empty tensor or the batched gradient. +*@par Outputs: +*y_grad: The return value, either an empty tensor or the batched gradient. *@attention Constraints: \n -*-The implementation for UnbatchGrad on Ascend uses AI CPU, with bad performance. 
 \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*UnbatchGrad runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(UnbatchGrad)
diff --git a/third_party/fwkacllib/inc/ops/bitwise_ops.h b/third_party/fwkacllib/inc/ops/bitwise_ops.h
index d2ed88a7..53d5c005 100644
--- a/third_party/fwkacllib/inc/ops/bitwise_ops.h
+++ b/third_party/fwkacllib/inc/ops/bitwise_ops.h
@@ -22,25 +22,20 @@ namespace ge {

/**
-*@brief Elementwise computes the bitwise right-shift of x and y.
+*@brief Element-wise computes the bitwise right-shift of x and y.

-*@par Inputs:
-*The input x can be k-dimensional tensor, num_lower and num_upper can be zero-dimensional scalar. Inputs include: \n
-* @li x:A Tensor. Must be one of the following types: int8, int16, int32, int64, uint8, uint16, uint32, uint64. \n
-* @li y:A Tensor. Must have the same type as x. \n
+*@par Inputs:
+*Inputs "x" and "y" are tensors of the same type. \n
+* @li x: A Tensor. Must be one of the following types: int8, int16, int32, \n
+int64, uint8, uint16, uint32, uint64. \n
+* @li y: A Tensor. Has the same type as "x". \n

-*@par Outputs:
-*@li z:A Tensor. Has the same type as x. \n
+*@par Outputs:
+* z: A Tensor. Has the same type as "x". \n

-*@attention Constraints:\n
-*-The implementation for Unique on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*@attention Constraints: \n
+*RightShift runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(RightShift)
diff --git a/third_party/fwkacllib/inc/ops/boosted_trees_ops.h b/third_party/fwkacllib/inc/ops/boosted_trees_ops.h
index 3f02a4e5..edc57e0c 100644
--- a/third_party/fwkacllib/inc/ops/boosted_trees_ops.h
+++ b/third_party/fwkacllib/inc/ops/boosted_trees_ops.h
@@ -22,28 +22,25 @@ namespace ge {

/**
-*@brief Bucketize each feature based on bucket boundaries.
+*@brief Bucketizes each feature based on bucket boundaries.

-*@par Inputs:
-*The input float_values can be 1-D tensor, bucket_boundaries can be 1-D. Inputs include: \n
-* @li float_values: List of Rank 1 Tensor each containing float values for a single feature. \n
-* @li bucket_boundaries:List of Rank 1 Tensors each containing the bucket boundaries for a single. \n
+*@par Inputs:
+*Input "float_values" is a 1D tensor. Input "bucket_boundaries" is \n
+a list of 1D tensors.
+* @li float_values: A list of rank 1 tensors each containing float \n
+values for a single feature.
+* @li bucket_boundaries: A list of rank 1 tensors each containing \n
+the bucket boundaries for a single feature.

-*@par Attributes:
-*@li num_features:number of features \n
+*@par Attributes:
+*@li num_features: Number of features.

-*@par Outputs:
-*@li y:List of Rank 1 Tensors each containing the bucketized values for a single feature. \n
+*@par Outputs:
+*@li y: A list of rank 1 tensors each containing the bucketized values for \n
+a single feature.

 *@attention Constraints: \n
-*-The implementation for BoostedTreesBucketize on Ascend uses AI CPU, with bad performance.
 \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*BoostedTreesBucketize runs on the Ascend AI CPU, which delivers poor performance. \n
 */
 REG_OP(BoostedTreesBucketize)
diff --git a/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h b/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h
index 2e3448fc..c2b5a3f8 100644
--- a/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h
+++ b/third_party/fwkacllib/inc/ops/candidate_sampling_ops.h
@@ -22,34 +22,41 @@ namespace ge {

/**
-*@brief Generates labels for candidate sampling with a learned unigram distribution.
-
-*@par Inputs:
-*The input true_classes must be two-dimensional matrices. Inputs include: \n
-*true_classes:A batch_size * num_true matrix, in which each row contains the IDs of the num_true target_classes in the corresponding original label.
-
-*@par Attributes:
-*@li num_true:Number of true labels per context.
-*@li num_sampled:Number of candidates to randomly sample.
-*@li unique:If unique is true, we sample with rejection, so that all sampled candidates in a batch are unique. This requires some approximation to estimate the post-rejection sampling probabilities.
-*@li range_max:The sampler will sample integers from the interval [0, range_max).
-*@li seed:If either seed or seed2 are set to be non-zero.
-*@li seed2:An second seed to avoid seed collision.
-
-*@par Outputs:
-*sampled_candidates:A vector of length num_sampled, in which each element is the ID of a sampled candidate.
-*true_expected_count:A batch_size * num_true matrix, representing the number of times each candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability.
-*sampled_expected_count:A vector of length num_sampled, for each sampled candidate representing the number of times the candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability.
+*@brief Generates labels for candidate sampling with \n
+a learned unigram distribution.
+
+*@par Inputs:
+*Input "true_classes" is a 2D matrix. \n
+*true_classes: A "batch_size * num_true" matrix, in which each row contains \n
+the IDs of the "num_true" "target_classes" in the corresponding original label.
+
+*@par Attributes:
+*@li num_true: Number of true labels per context.
+*@li num_sampled: Number of candidates to randomly sample.
+*@li unique: If "unique" is true, samples with rejection, \n
+so that all sampled candidates in a batch are unique.
+*This requires some approximation to estimate the post-rejection \n
+sampling probabilities.
+*@li range_max: The sampler will sample integers from the interval \n
+[0, range_max).
+*@li seed: If either "seed" or "seed2" is set to be non-zero, the random \n
+number generator is seeded by the given seed. Otherwise, a random seed is used.
+*@li seed2: A second seed to avoid seed collision.
+
+*@par Outputs:
+*@li sampled_candidates: A vector of length "num_sampled", in which each \n
+element is the ID of a sampled candidate.
+*@li true_expected_count: A "batch_size * num_true" matrix, representing \n
+the number of times each candidate is expected to occur in a batch of sampled \n
+candidates. If "unique" is true, then this is a probability.
+*@li sampled_expected_count: A vector of length "num_sampled", representing, \n
+for each sampled candidate, the number of times it is expected to occur in \n
+a batch of sampled candidates. \n
+*If "unique" is true, then this is a probability.
 \n
 *@attention Constraints: \n
-*-The implementation for ThreadUnsafeUnigramCandidateSampler on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*ThreadUnsafeUnigramCandidateSampler runs on the Ascend AI CPU, \n
+which delivers poor performance.
 */
 REG_OP(ThreadUnsafeUnigramCandidateSampler)
@@ -66,34 +73,41 @@ REG_OP(ThreadUnsafeUnigramCandidateSampler)
     .OP_END_FACTORY_REG(ThreadUnsafeUnigramCandidateSampler)

/**
-*@brief Generates labels for candidate sampling with a learned unigram distribution.
-
-*@par Inputs:
-*The input true_classes must be two-dimensional matrices. Inputs include: \n
-*true_classes:A batch_size * num_true matrix, in which each row contains the IDs of the num_true target_classes in the corresponding original label.
-
-*@par Attributes:
-*@li num_true:Number of true labels per context.
-*@li num_sampled:Number of candidates to randomly sample.
-*@li unique:If unique is true, we sample with rejection, so that all sampled candidates in a batch are unique. This requires some approximation to estimate the post-rejection sampling probabilities.
-*@li range_max:The sampler will sample integers from the interval [0, range_max).
-*@li seed:If either seed or seed2 are set to be non-zero.
-*@li seed2:An second seed to avoid seed collision.
-
-*@par Outputs:
-*@li sampled_candidates:A vector of length num_sampled, in which each element is the ID of a sampled candidate.
-*@li true_expected_count:A batch_size * num_true matrix, representing the number of times each candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability.
-*@li sampled_expected_count:A vector of length num_sampled, for each sampled candidate representing the number of times the candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability.
+*@brief Generates labels for candidate sampling with a learned \n
+unigram distribution.
+
+*@par Inputs:
+*true_classes: A "batch_size * num_true" matrix, in which each row contains \n
+the IDs of the "num_true" "target_classes" in the corresponding original label.
+*Input "true_classes" is a 2D matrix.
+
+*@par Attributes:
+*@li num_true: Number of true labels per context.
+*@li num_sampled: Number of candidates to randomly sample.
+*@li unique: If "unique" is true, samples with rejection, \n
+so that all sampled candidates in a batch are unique.
+*This requires some approximation to estimate the post-rejection \n
+sampling probabilities.
+*@li range_max: The sampler will sample integers from the interval \n
+[0, range_max).
+*@li seed: If either "seed" or "seed2" is set to be non-zero, the random \n
+number generator is seeded by the given seed. Otherwise, a random seed is used.
+*@li seed2: A second seed to avoid seed collision.
+
+*@par Outputs:
+*@li sampled_candidates: A vector of length "num_sampled", \n
+in which each element is the ID of a sampled candidate.
+*@li true_expected_count: A "batch_size * num_true" matrix, representing \n
+the number of times each candidate is expected to occur in a batch of sampled \n
+candidates.
+*If "unique" is true, then this is a probability.
+*@li sampled_expected_count: A vector of length "num_sampled", representing, \n
+for each sampled candidate, the number of times it is expected to occur in \n
+a batch of sampled candidates. \n
+*If "unique" is true, then this is a probability.
*@attention Constraints: \n -*-The implementation for UniformCandidateSampler on Ascend uses AI CPU, with bad performance. \n - -*@par Quantization supported or not -*Not supported -*@par Quantized inference supported or not -*Supported -*@par L2 convergence supported or not -*@par Multiple batches supported or not +*UniformCandidateSampler runs on the Ascend AI CPU, \n +which delivers poor performance. */ REG_OP(UniformCandidateSampler) @@ -110,40 +124,53 @@ REG_OP(UniformCandidateSampler) .OP_END_FACTORY_REG(UniformCandidateSampler) /** -*@brief Generates labels for candidate sampling with a learned unigram distribution. - -*@par Inputs: -*The input true_classes can be two-dimensional matrices. Inputs include: \n -*true_classes:A batch_size * num_true matrix, in which each row contains the IDs of the num_true target_classes in the corresponding original label. - -*@par Attributes: -*@li num_true:Number of true labels per context. -*@li num_sampled:Number of candidates to randomly sample. -*@li unique:If unique is true, we sample with rejection, so that all sampled candidates in a batch are unique. This requires some approximation to estimate the post-rejection sampling probabilities. -*@li range_max:The sampler will sample integers from the interval [0, range_max). -*@li vocab_file:Each valid line in this file (which should have a CSV-like format) corresponds to a valid word ID. IDs are in sequential order, starting from num_reserved_ids. -*@li distortion:The distortion is used to skew the unigram probability distribution. Each weight is first raised to the distortion's power before adding to the internal unigram distribution. -*@li num_reserved_ids:Optionally some reserved IDs can be added in the range [0, ..., num_reserved_ids) by the users. One use case is that a special unknown word token is used as ID 0. -*@li num_shards:A sampler can be used to sample from a subset of the original range in order to speed up the whole computation through parallelism. -*@li shard:A sampler can be used to sample from a subset of the original range in order to speed up the whole computation through parallelism. -*@li unigrams:A list of unigram counts or probabilities, one per ID in sequential order. -*@li seed:If either seed or seed2 are set to be non-zero. -*@li seed2:An second seed to avoid seed collision. - -*@par Outputs: -*@li sampled_candidates:A vector of length num_sampled, in which each element is the ID of a sampled candidate. -*@li true_expected_count:A batch_size * num_true matrix, representing the number of times each candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability. -*@li sampled_expected_count:A vector of length num_sampled, for each sampled candidate representing the number of times the candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability. +*@brief Generates labels for candidate sampling with a learned \n +unigram distribution. + +*@par Inputs: +*true_classes: A "batch_size * num_true" matrix, in which each row contains \n +the IDs of the "num_true" "target_classes" in the corresponding original label. +* Input "true_classes" is a 2D matrix. + +*@par Attributes: +*@li num_true: Number of true labels per context. +*@li num_sampled: Number of candidates to randomly sample. +*@li unique: If "unique" is true, samples with rejection, \n +so that all sampled candidates in a batch are unique. 
This requires \n
+some approximation to estimate the post-rejection sampling probabilities.
+*@li range_max: The sampler will sample integers from the interval [0, range_max).
+*@li vocab_file: Each valid line in this file (which should have a \n
+CSV-like format) corresponds to a valid word ID. \n
+*IDs are in sequential order, starting from num_reserved_ids.
+*@li distortion: The distortion is used to skew the unigram probability \n
+distribution. Each weight is first raised to the distortion's power before \n
+adding to the internal unigram distribution.
+*@li num_reserved_ids: Optionally some reserved IDs can be added in the range \n
+[0, ..., num_reserved_ids) by the users. \n
+* One use case is that a special unknown word token is used as ID 0.
+*@li num_shards: A sampler can be used to sample from a subset of the \n
+original range in order to speed up the whole computation through parallelism.
+*@li shard: A sampler can be used to sample from a subset of the original \n
+range in order to speed up the whole computation through parallelism.
+*@li unigrams: A list of unigram counts or probabilities, one per ID in \n
+sequential order.
+*@li seed: If either "seed" or "seed2" is set to be non-zero, the random \n
+number generator is seeded by the given seed. Otherwise, a random seed is used.
+*@li seed2: A second seed to avoid seed collision.
+
+*@par Outputs:
+*@li sampled_candidates: A vector of length "num_sampled", in which each \n
+element is the ID of a sampled candidate.
+*@li true_expected_count: A "batch_size * num_true" matrix, representing the \n
+number of times each candidate is expected to occur in a batch of sampled \n
+candidates. If "unique" is true, then this is a probability.
+*@li sampled_expected_count: A vector of length "num_sampled", \n
+for each sampled candidate representing the number of times the candidate is \n
+expected to occur in a batch of sampled candidates. \n
+If "unique" is true, then this is a probability.

 *@attention Constraints: \n
-*-The implementation for FixedUnigramCandidateSampler on Ascend uses AI CPU, with bad performance. \n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+* FixedUnigramCandidateSampler runs on the Ascend AI CPU, \n
+which delivers poor performance.
 */
 REG_OP(FixedUnigramCandidateSampler)
@@ -166,34 +193,40 @@ REG_OP(FixedUnigramCandidateSampler)
     .OP_END_FACTORY_REG(FixedUnigramCandidateSampler)

/**
-*@brief Generates labels for candidate sampling with a learned unigram distribution.
-
-*@par Inputs:
-*The input true_classes can be two-dimensional matrices. Inputs include: \n
-*true_classes:A batch_size * num_true matrix, in which each row contains the IDs of the num_true target_classes in the corresponding original label.
-
-*@par Attributes:
-*@li num_true:Number of true labels per context.
-*@li num_sampled:Number of candidates to randomly sample.
-*@li unique:If unique is true, we sample with rejection, so that all sampled candidates in a batch are unique. This requires some approximation to estimate the post-rejection sampling probabilities.
-*@li range_max:The sampler will sample integers from the interval [0, range_max).
-*@li seed:If either seed or seed2 are set to be non-zero.
-*@li seed2:An second seed to avoid seed collision.
-
-*@par Outputs:
-*@li sampled_candidates:A vector of length num_sampled, in which each element is the ID of a sampled candidate.
-*@li true_expected_count:A batch_size * num_true matrix, representing the number of times each candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability. -*@li sampled_expected_count:A vector of length num_sampled, for each sampled candidate representing the number of times the candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability. +*@brief Generates labels for candidate sampling with a learned \n +unigram distribution. + +*@par Inputs: +*true_classes: A "batch_size * num_true" matrix, in which each row contains \n +the IDs of the "num_true" "target_classes" in the corresponding original label. +* Input "true_classes" is a 2D matrix. + +*@par Attributes: +*@li num_true: Number of true labels per context. +*@li num_sampled: Number of candidates to randomly sample. +*@li unique: If "unique" is true, samples with rejection, \n +so that all sampled candidates in a batch are unique. \n +*This requires some approximation to estimate the post-rejection \n +sampling probabilities. +*@li range_max: The sampler will sample integers from the interval \n +[0, range_max). +*@li seed: If either "seed" or "seed2" are set to be non-zero. +*@li seed2: A second seed to avoid seed collision. + +*@par Outputs: +*@li sampled_candidates: A vector of length "num_sampled", in which each \n +element is the ID of a sampled candidate. +*@li true_expected_count: A "batch_size * num_true" matrix, representing \n +the number of times each candidate is expected to occur in a batch of sampled candidates. \n +*If "unique" is true, then this is a probability. +*@li sampled_expected_count: A vector of length "num_sampled", for each \n +sampled candidate representing the number of times the candidate is expected \n +to occur in a batch of sampled candidates. \n +*If "unique" is true, then this is a probability. *@attention Constraints: \n -*-The implementation for LearnedUnigramCandidateSampler on Ascend uses AI CPU, with bad performance. \n - -*@par Quantization supported or not -*Not supported -*@par Quantized inference supported or not -*Supported -*@par L2 convergence supported or not -*@par Multiple batches supported or not +*LearnedUnigramCandidateSampler runs on the Ascend AI CPU, which delivers \n +poor performance. */ REG_OP(LearnedUnigramCandidateSampler) @@ -210,34 +243,39 @@ REG_OP(LearnedUnigramCandidateSampler) .OP_END_FACTORY_REG(LearnedUnigramCandidateSampler) /** -*@brief Generates labels for candidate sampling with a log-uniform distribution. - -*@par Inputs: -*The input true_classes can be two-dimensional matrices. Inputs include: \n -*true_classes:A batch_size * num_true matrix, in which each row contains the IDs of the num_true target_classes in the corresponding original label. - -*@par Attributes: -*@li num_true:Number of true labels per context. -*@li num_sampled:Number of candidates to randomly sample. -*@li unique:If unique is true, we sample with rejection, so that all sampled candidates in a batch are unique. This requires some approximation to estimate the post-rejection sampling probabilities. -*@li range_max:The sampler will sample integers from the interval [0, range_max). -*@li seed:If either seed or seed2 are set to be non-zero. -*@li seed2:An second seed to avoid seed collision. - -*@par Outputs: -*@li sampled_candidates:A vector of length num_sampled, in which each element is the ID of a sampled candidate. 
-*@li true_expected_count:A batch_size * num_true matrix, representing the number of times each candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability. -*@li sampled_expected_count:A vector of length num_sampled, for each sampled candidate representing the number of times the candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability. - -*@attention Constraints:\n -*-The implementation for LogUniformCandidateSampler on Ascend uses AI CPU, with bad performance.\n - -*@par Quantization supported or not -*Not supported -*@par Quantized inference supported or not -*Supported -*@par L2 convergence supported or not -*@par Multiple batches supported or not +*@brief Generates labels for candidate sampling with a log-uniform \n +distribution. + +*@par Inputs: +*true_classes: A "batch_size * num_true" matrix, in which each row contains \n +the IDs of the "num_true" "target_classes" in the corresponding original label. \n +* Input "true_classes" is a 2D matrix. + +*@par Attributes: +*@li num_true: Number of true labels per context. +*@li num_sampled: Number of candidates to randomly sample. +*@li unique: If "unique" is true, samples with rejection, so that all \n +sampled candidates in a batch are unique. This requires some approximation \n +to estimate the post-rejection sampling probabilities. +*@li range_max: The sampler will sample integers from the interval \n +[0, range_max). +*@li seed: If either "seed" or "seed2" are set to be non-zero. +*@li seed2: A second seed to avoid seed collision. + +*@par Outputs: +*@li sampled_candidates: A vector of length "num_sampled", in which each \n +element is the ID of a sampled candidate. +*@li true_expected_count: A "batch_size * num_true" matrix, representing \n +the number of times each candidate is expected to occur in a batch of sampled \n +candidates. If "unique" is true, then this is a probability. +*@li sampled_expected_count: A vector of length "num_sampled", for each \n +sampled candidate representing the number of times the candidate is expected \n +to occur in a batch of sampled candidates. \n +*If "unique" is true, then this is a probability. + +*@attention Constraints: \n +*LogUniformCandidateSampler runs on the Ascend AI CPU, which delivers \n +poor performance. */ REG_OP(LogUniformCandidateSampler) @@ -254,33 +292,35 @@ REG_OP(LogUniformCandidateSampler) .OP_END_FACTORY_REG(LogUniformCandidateSampler) /** -*@brief Generates labels for candidate sampling with a learned unigram distribution. - -*@par Inputs: -*The input true_classes can be two-dimensional matrices. Inputs include: \n -*true_classes:A batch_size * num_true matrix, in which each row contains the IDs of the num_true target_classes in the corresponding original label. - -*@par Attributes: -*@li num_true:Number of true labels per context. -*@li num_sampled:Number of candidates to randomly sample. -*@li unique:If unique is true, we sample with rejection, so that all sampled candidates in a batch are unique. This requires some approximation to estimate the post-rejection sampling probabilities. -*@li seed:If either seed or seed2 are set to be non-zero. -*@li seed2:An second seed to avoid seed collision. - -*@par Outputs: -*@li sampled_candidates:A vector of length num_sampled, in which each element is the ID of a sampled candidate. 
-*@li true_expected_count:A batch_size * num_true matrix, representing the number of times each candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability.
-*@li sampled_expected_count:A vector of length num_sampled, for each sampled candidate representing the number of times the candidate is expected to occur in a batch of sampled candidates. If unique=true, then this is a probability.
-
-*@attention Constraints:\n
-*-The implementation for AllCandidateSampler on Ascend uses AI CPU, with bad performance.\n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*@brief Generates labels for candidate sampling with a learned \n
+unigram distribution.
+
+*@par Inputs:
+*true_classes: A "batch_size * num_true" matrix, in which each row contains \n
+the IDs of the "num_true" "target_classes" in the corresponding original label. \n
+* Input "true_classes" is a 2D matrix.
+
+*@par Attributes:
+*@li num_true: Number of true labels per context.
+*@li num_sampled: Number of candidates to randomly sample.
+*@li unique: If "unique" is true, samples with rejection, \n
+so that all sampled candidates in a batch are unique. This requires some \n
+approximation to estimate the post-rejection sampling probabilities.
+*@li seed: If either "seed" or "seed2" is set to be non-zero, the random \n
+number generator is seeded by the given seed. Otherwise, it is seeded by \n
+a random seed.
+*@li seed2: A second seed to avoid seed collision.
+
+*@par Outputs:
+*@li sampled_candidates: A vector of length "num_sampled", \n
+in which each element is the ID of a sampled candidate.
+*@li true_expected_count: A "batch_size * num_true" matrix, representing the \n
+number of times each candidate is expected to occur in a batch of sampled candidates. \n
+*If "unique" is true, then this is a probability.
+*@li sampled_expected_count: A vector of length "num_sampled", for each \n
+sampled candidate representing the number of times the candidate is expected \n
+to occur in a batch of sampled candidates. If "unique" is true, then this is a probability.
+
+*@attention Constraints: \n
+*AllCandidateSampler runs on the Ascend AI CPU, which delivers poor performance. \n
 */

 REG_OP(AllCandidateSampler)
@@ -296,32 +336,28 @@ REG_OP(AllCandidateSampler)
     .OP_END_FACTORY_REG(AllCandidateSampler)

 /**
-*@brief Computes the ids of the positions in sampled_candidates that match true_labels.
-
-*@par Inputs:
-* @li The input true_classes can be two-dimensional matrices. Inputs include: \n
-* @li true_classes:The true_classes output of UnpackSparseLabels. \n
-* @li sampled_candidates:The sampled_candidates output of CandidateSampler. \n
-
-*@par Attributes:
-*@li num_true:Number of true labels per context.
-*@li seed:If either seed or seed2 are set to be non-zero.
-*@li seed2:An second seed to avoid seed collision.
-
-*@par Outputs:
-* @li indices:A vector of indices corresponding to rows of true_candidates.
-* @li ids:A vector of IDs of positions in sampled_candidates that match a true_label for the row with the corresponding index in indices.
-* @li weights:A vector of the same length as indices and ids, in which each element is
-FLOAT_MAX.
-
-*@attention Constraints:\n
-*-The implementation for ComputeAccidentalHits on Ascend uses AI CPU, with bad performance.\n
-
-*@par Quantization supported or not
-*Not supported
-*@par Quantized inference supported or not
-*Supported
-*@par L2 convergence supported or not
-*@par Multiple batches supported or not
+*@brief Computes the "ids" of the positions in "sampled_candidates" that \n
+match "true_labels".
+
+*@par Inputs:
+* @li true_classes: The "true_classes" output of UnpackSparseLabels. \n
+* @li sampled_candidates: The "sampled_candidates" output of CandidateSampler. \n
+* Input "true_classes" is a 2D matrix.
+
+*@par Attributes:
+*@li num_true: Number of true labels per context.
+*@li seed: If either "seed" or "seed2" is set to be non-zero, the random \n
+number generator is seeded by the given seed. Otherwise, it is seeded by \n
+a random seed.
+*@li seed2: A second seed to avoid seed collision.
+
+*@par Outputs:
+* @li indices: A vector of indices corresponding to rows of "true_candidates".
+* @li ids: A vector of IDs of positions in "sampled_candidates" that match a \n
+"true_label" for the row with the corresponding index in "indices".
+* @li weights: A vector of the same length as "indices" and "ids", in which \n
+each element is -FLOAT_MAX.
+
+*@attention Constraints: \n
+*ComputeAccidentalHits runs on the Ascend AI CPU, which delivers poor performance. \n
 */

 REG_OP(ComputeAccidentalHits)
diff --git a/third_party/fwkacllib/inc/ops/clip_boxes.h b/third_party/fwkacllib/inc/ops/clip_boxes.h
new file mode 100644
index 00000000..967dc1b9
--- /dev/null
+++ b/third_party/fwkacllib/inc/ops/clip_boxes.h
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #ifndef GE_OP_CLIP_BOXES_H
+ #define GE_OP_CLIP_BOXES_H
+
+ #include "graph/operator_reg.h"
+
+ namespace ge {
+
+ REG_OP(ClipBoxes)
+     .INPUT(boxes_input, TensorType({DT_FLOAT16}))
+     .INPUT(img_size, TensorType({DT_INT32}))
+     .OUTPUT(boxes_output, TensorType({DT_FLOAT16}))
+     .OP_END_FACTORY_REG(ClipBoxes)
+
+ REG_OP(ClipBoxesD)
+     .INPUT(boxes_input, TensorType({DT_FLOAT16}))
+     .REQUIRED_ATTR(img_size, ListInt)
+     .OUTPUT(boxes_output, TensorType({DT_FLOAT16}))
+     .OP_END_FACTORY_REG(ClipBoxesD)
+ } // namespace ge
+
+ #endif // GE_OP_CLIP_BOXES_H
diff --git a/third_party/fwkacllib/inc/ops/control_flow_ops.h b/third_party/fwkacllib/inc/ops/control_flow_ops.h
index 06e047b1..5eebb9e3 100644
--- a/third_party/fwkacllib/inc/ops/control_flow_ops.h
+++ b/third_party/fwkacllib/inc/ops/control_flow_ops.h
@@ -22,6 +22,26 @@

 namespace ge {

+/**
+ *@brief Forwards the value of an available tensor from input "x" to output "y". \n
+ * Merge waits for at least one of the input tensors to become available. \n
+ * It is usually combined with Switch to implement branching. \n
+ * Merge forwards the first tensor to become available to output "y", \n
+ * and sets "value_index" to the index of that tensor in the inputs.

+ *@par Inputs:
+ *x: The input tensors, one of which will become available. \n
+ * Must be one of the following types: float16, float32, float64, int8, \n
+ * int16, int32, int64, uint8, uint16, uint32, uint64, bool.

+ *@par Outputs:
+ *@li y: The available tensor. Has the same type as "x".
+ *@li value_index: A scalar of type int32, for the index of the chosen input \n
+ * tensor.

+ *@see Switch()

+ */
 REG_OP(Merge)
     .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8,
         DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32,
@@ -32,6 +52,26 @@ REG_OP(Merge)
     .OUTPUT(value_index, TensorType({DT_INT32}))
     .OP_END_FACTORY_REG(Merge)

+/**
+ *@brief Forwards the value of an available tensor from input "x" to output "y". \n
+ * Merge waits for at least one of the input tensors to become available. \n
+ * It is usually combined with Switch to implement branching. \n
+ * Merge forwards the first tensor to become available to output "y", \n
+ * and sets "value_index" to the index of that tensor in the inputs.

+ *@par Inputs:
+ *x: The input tensors, one of which will become available. \n
+ * Must be one of the following types: float16, float32, float64, int8, \n
+ * int16, int32, int64, uint8, uint16, uint32, uint64, bool.

+ *@par Outputs:
+ *@li y: The available tensor. Has the same type as "x".
+ *@li value_index: A scalar of type int32, for the index of the chosen input \n
+ * tensor.

+ *@see Switch() | Merge()

+ */
 REG_OP(RefMerge)
     .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8,
         DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32,
@@ -42,6 +82,26 @@ REG_OP(RefMerge)
     .OUTPUT(value_index, TensorType({DT_INT32}))
     .OP_END_FACTORY_REG(RefMerge)

+/**
+ *@brief Forwards "data" to the output port determined by "pred". \n
+ * If "pred" is "true", the data input is forwarded to "output_true". \n
+ * Otherwise, the data is forwarded to "output_false".

+ *@par Inputs:
+ *@li data: The tensor to be forwarded. \n
+ * Must be one of the following types: float16, float32, float64, \n
+ * int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool.
+ *@li pred: A boolean scalar, specifying the output port that will receive data.

+ *@par Outputs:
+ *@li output_false: If "pred" is "false", data will be forwarded to this output. \n
+ * Has the same type as "data".
+ *@li output_true: If "pred" is "true", data will be forwarded to this output. \n
+ * Has the same type as "data".

+ *@see Merge()

+ */
 REG_OP(Switch)
     .INPUT(data, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8,
         DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32,
@@ -55,6 +115,26 @@ REG_OP(Switch)
         DT_UINT64, DT_BOOL}))
     .OP_END_FACTORY_REG(Switch)

+/**
+ *@brief Forwards "data" to the output port determined by "pred". \n
+ * If "pred" is "true", the data input is forwarded to "output_true". \n
+ * Otherwise, the data is forwarded to "output_false".

+ *@par Inputs:
+ *@li data: The ref tensor to be forwarded. \n
+ * Must be one of the following types: float16, float32, float64, \n
+ * int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool.
+ *@li pred: A boolean scalar, specifying the output port that will receive data.

+ *@par Outputs:
+ *@li output_false: If "pred" is "false", data will be forwarded to this output. \n
+ * Has the same type as "data".
+ *@li output_true: If "pred" is "true", data will be forwarded to this output. \n
+ * Has the same type as "data".
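+
+ *@par Example:
+ * A minimal wiring sketch for the Switch/Merge family (illustrative only; \n
+ * the ref variants are wired the same way, the upstream "data" and "pred" \n
+ * operators are assumed to exist, and the setter helpers generated by \n
+ * REG_OP are assumed):
+ *@code
+ *   // Route "data" to one of two ports depending on "pred", then merge
+ *   // whichever branch becomes available back into a single tensor.
+ *   auto sw = ge::op::Switch("switch")
+ *       .set_input_data(data)
+ *       .set_input_pred(pred);
+ *   auto merge = ge::op::Merge("merge");
+ *   merge.create_dynamic_input_x(2);
+ *   merge.set_dynamic_input_x(0, sw, "output_false");
+ *   merge.set_dynamic_input_x(1, sw, "output_true");
+ *@endcode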
+ + *@see Merge() | Switch() + + */ REG_OP(RefSwitch) .INPUT(data, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, @@ -78,6 +158,30 @@ REG_OP(SwitchN) DT_UINT64, DT_BOOL})) .OP_END_FACTORY_REG(SwitchN) + +/** + *@brief Creates or finds a child frame, and makes "x" available to the child \n + * frame. This op is used together with Exit to create loops in the graph. \n + * The Executor uses the unique "frame_name" to identify frames. \n + * If "is_constant" is "true", output "y" is a constant in the child \n + * frame; otherwise it may be changed in the child frame. + + *@par Inputs: + *x: The tensor to be made available to the child frame. \n + * Must be one of the following types: float16, float32, float64, int8, \n + * int16, int32, int64, uint8, uint16, uint32, uint64, bool. + + *@par Attributes: + *@li frame_name: A required string. The name of the child frame. + *@li is_constant: A required bool. If true, the output is constant in \n + * the child frame. + + *@par Outputs: + *y: A Tensor. Has the same type as "x". + + *@see Exit() + + */ REG_OP(Enter) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, @@ -85,10 +189,33 @@ REG_OP(Enter) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_BOOL})) - .ATTR(frame_name, String, "") - .ATTR(is_constant, Bool, false) + .REQUIRED_ATTR(frame_name, String) + .REQUIRED_ATTR(is_constant, Bool) .OP_END_FACTORY_REG(Enter) +/** + *@brief Creates or finds a child frame, and makes "x" available to the child \n + * frame. This op is used together with Exit to create loops in the graph. \n + * The Executor uses the unique "frame_name" to identify frames. \n + * If "is_constant" is "true", output "y" is a constant in the child \n + * frame; otherwise it may be changed in the child frame. + + *@par Inputs: + *x: The tensor to be made available to the child frame. \n + * Must be one of the following types: float16, float32, float64, int8, \n + * int16, int32, int64, uint8, uint16, uint32, uint64, bool. + + *@par Attributes: + *@li frame_name: A required string. The name of the child frame. + *@li is_constant: A required bool. If true, the output is constant in \n + * the child frame. + + *@par Outputs: + *y: A tensor. Has the same type as "x". + + *@see Exit() | Enter() + + */ REG_OP(RefEnter) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, @@ -96,15 +223,40 @@ REG_OP(RefEnter) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_BOOL})) - .ATTR(frame_name, String, "") - .ATTR(is_constant, Bool, false) + .REQUIRED_ATTR(frame_name, String) + .REQUIRED_ATTR(is_constant, Bool) .OP_END_FACTORY_REG(RefEnter) +/** + *@brief Forwards the input to the output. This op represents the loop \n + * termination condition. + + *@par Inputs: + *x: A boolean scalar. The condition of the Switch op. + + *@par Outputs: + *y: The tensor "x". + + *@see Switch() + + */ REG_OP(LoopCond) .INPUT(x, TensorType({DT_BOOL})) .OUTPUT(y, TensorType({DT_BOOL})) .OP_END_FACTORY_REG(LoopCond) +/** + *@brief Makes the input available to the next iteration. + + *@par Inputs: + *x: The tensor to be made available to the next iteration. 
\n + * Must be one of the following types: float16, float32, float64, int8, \n + * int16, int32, int64, uint8, uint16, uint32, uint64, bool. + + *@par Outputs: + *y: A Tensor. Has the same type as "x". + + */ REG_OP(NextIteration) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, @@ -114,6 +266,18 @@ REG_OP(NextIteration) DT_UINT64, DT_BOOL})) .OP_END_FACTORY_REG(NextIteration) +/** + *@brief Makes the input available to the next iteration. + + *@par Inputs: + *x: The tensor to be made available to the next iteration. \n + * Must be one of the following types: float16, float32, float64, int8, \n + * int16, int32, int64, uint8, uint16, uint32, uint64, bool. + + *@par Outputs: + *y: A tensor. Has the same type as "x". + + */ REG_OP(RefNextIteration) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, @@ -123,6 +287,20 @@ REG_OP(RefNextIteration) DT_UINT64, DT_BOOL})) .OP_END_FACTORY_REG(RefNextIteration) +/** + *@brief Exits the current frame to its parent frame. + + *@par Inputs: + *x: The tensor to be made available to the parent frame. \n + * Must be one of the following types: float16, float32, float64, int8, \n + * int16, int32, int64, uint8, uint16, uint32, uint64, bool. + + *@par Outputs: + *y: A Tensor. Has the same type as "x". + + *@see Enter() + + */ REG_OP(Exit) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, @@ -132,6 +310,20 @@ REG_OP(Exit) DT_UINT64, DT_BOOL})) .OP_END_FACTORY_REG(Exit) +/** + *@brief Exits the current frame to its parent frame. + + *@par Inputs: + *x: The tensor to be made available to the parent frame. \n + * Must be one of the following types: float16, float32, float64, int8, \n + * int16, int32, int64, uint8, uint16, uint32, uint64, bool. + + *@par Outputs: + *y: A tensor. Has the same type as "x". + + *@see Enter() | Exit() + + */ REG_OP(RefExit) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, @@ -141,6 +333,12 @@ REG_OP(RefExit) DT_UINT64, DT_BOOL})) .OP_END_FACTORY_REG(RefExit) +/** + *@brief Only useful as a placeholder for control edges. \n + * It is similar to a no-op that always produces a live control output \n + * even when some control inputs are dead. + + */ REG_OP(ControlTrigger) .OP_END_FACTORY_REG(ControlTrigger) } // namespace ge diff --git a/third_party/fwkacllib/inc/ops/ctc_ops.h b/third_party/fwkacllib/inc/ops/ctc_ops.h new file mode 100644 index 00000000..893c3166 --- /dev/null +++ b/third_party/fwkacllib/inc/ops/ctc_ops.h @@ -0,0 +1,66 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef GE_OP_CTC_OPS_H
+#define GE_OP_CTC_OPS_H
+
+#include "graph/operator.h"
+#include "graph/operator_reg.h"
+
+namespace ge {
+
+/**
+*@brief Calculates the CTC Loss (log probability) for each batch entry. \n
+Also calculates the gradient.
+
+*@par Inputs:
+*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+*@li labels_indices: The indices of a `SparseTensor`. \n
+`labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for \n
+`(batch b, time t)`.
+*@li labels_values: The values (labels) associated with the given batch and time.
+*@li sequence_length: A vector containing sequence lengths (batch).
+
+*@par Outputs:
+*@li loss: A vector (batch) containing log-probabilities.
+*@li gradient: The gradient of `loss`. 3-D, shape: `(max_time x \n
+batch_size x num_classes)`.
+
+*@par Attributes:
+*@li preprocess_collapse_repeated: Scalar. If "true", repeated labels are \n
+collapsed prior to the CTC calculation. If not specified, defaults to "false".
+*@li ctc_merge_repeated: Scalar. If set to "false", *during* CTC calculation \n
+repeated non-blank labels will not be merged and are interpreted as \n
+individual labels. This is a simplified version of CTC. \n
+If not specified, defaults to "true".
+*@li ignore_longer_outputs_than_inputs: Scalar. If not specified, defaults to "false".
+
+*/
+
+REG_OP(CTCLoss)
+    .INPUT(inputs, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .INPUT(labels_indices, TensorType({DT_INT64}))
+    .INPUT(labels_values, TensorType({DT_INT32}))
+    .INPUT(sequence_length, TensorType({DT_INT32}))
+    .OUTPUT(loss, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .OUTPUT(gradient, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .ATTR(preprocess_collapse_repeated, Bool, false)
+    .ATTR(ctc_merge_repeated, Bool, true)
+    .ATTR(ignore_longer_outputs_than_inputs, Bool, false)
+    .OP_END_FACTORY_REG(CTCLoss)
+
+} // namespace ge
+
+#endif //GE_OP_CTC_OPS_H
\ No newline at end of file
diff --git a/third_party/fwkacllib/inc/ops/data_flow_ops.h b/third_party/fwkacllib/inc/ops/data_flow_ops.h
index e546fb3e..0eaee06c 100644
--- a/third_party/fwkacllib/inc/ops/data_flow_ops.h
+++ b/third_party/fwkacllib/inc/ops/data_flow_ops.h
@@ -22,16 +22,61 @@

 namespace ge {

+/**
+*@brief This operation returns true if the queue is closed and false if \n
+the queue is open.
+
+*@par Inputs:
+*The input handle must have the resource type. Inputs include: \n
+*handle:A Tensor of type resource. The handle to a queue.
+
+*@par Outputs:
+*is_closed:A Tensor of type bool.
+
+*/
+
 REG_OP(QueueIsClosed)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .OUTPUT(is_closed, TensorType({DT_BOOL}))
     .OP_END_FACTORY_REG(QueueIsClosed)

+/**
+*@brief Computes the number of elements in the given queue.
+
+*@par Inputs:
+*The input handle must have the resource type. Inputs include: \n
+*handle:A Tensor of type mutable resource. The handle to a queue.
+
+*@par Outputs:
+*size:A Tensor of type int32.
+
+*/
+
 REG_OP(QueueSize)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .OUTPUT(size, TensorType({DT_INT32}))
     .OP_END_FACTORY_REG(QueueSize)

+/**
+*@brief A queue that produces elements in first-in first-out order.
+
+*@par Attributes:
+*@li component_types: A list of DType objects. The length of component_types \n
+must equal the number of tensors in each queue element.
+*@li shapes:(Optional.) A list of fully-defined TensorShape objects with the \n
+same length as dtypes, or None.
+*@li capacity:An integer. The upper bound on the number of elements that may \n
+be stored in this queue.
+*@li container: An optional string. Defaults to "". If non-empty, this queue \n
+is placed in the given container. Otherwise, a default container is used.
+*@li shared_name:(Optional.) If non-empty, this queue will be shared under \n
+the given name across multiple sessions.
+
+*@par Outputs:
+*handle:A Tensor of type mutable resource. The handle to a queue.
+
+*/
+
 REG_OP(FIFOQueue)
     .OUTPUT(handle, TensorType({DT_RESOURCE}))
     .REQUIRED_ATTR(component_types, ListType)
@@ -41,61 +86,200 @@ REG_OP(FIFOQueue)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(FIFOQueue)

+/**
+*@brief Enqueues a tuple of one or more tensors in the given queue.
+
+*@par Inputs:
+*The input handle must have the resource type. Inputs include: \n
+*@li handle:A Tensor of type mutable resource. The handle to a queue.
+*@li components: A list of Tensor objects. One or more tensors from which \n
+the enqueued tensors should be taken.
+
+*@par Attributes:
+*timeout_ms: An optional int. Defaults to -1. If the queue is full, this \n
+operation will block for up to timeout_ms milliseconds. Note: This option \n
+is not supported yet.
+
+*/
+
 REG_OP(QueueEnqueue)
     .INPUT(handle, TensorType({DT_RESOURCE}))
-    .DYNAMIC_INPUT(components, TensorType({DT_INT8, DT_UINT8, \
-        DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_UINT32, \
-        DT_UINT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL}))
+    .DYNAMIC_INPUT(components, TensorType({DT_FLOAT, DT_FLOAT16, \
+        DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, \
+        DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE, DT_RESOURCE, \
+        DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT16, DT_QUINT16, \
+        DT_QINT8, DT_QUINT8, DT_QINT32}))
     .ATTR(timeout_ms, Int, -1)
     .OP_END_FACTORY_REG(QueueEnqueue)

+/**
+*@brief Enqueues zero or more tuples of one or more tensors in the given queue.
+
+*@par Inputs:
+*The input handle must have the resource type. Inputs include: \n
+*@li handle:A Tensor of type mutable resource. The handle to a queue.
+*@li components: A list of Tensor objects. One or more tensors from which \n
+the enqueued tensors should be taken.
+
+*@par Attributes:
+*timeout_ms: An optional int. Defaults to -1. If the queue is full, this \n
+operation will block for up to timeout_ms milliseconds. Note: This option \n
+is not supported yet.
+
+*/
+
 REG_OP(QueueEnqueueMany)
     .INPUT(handle, TensorType({DT_RESOURCE}))
-    .DYNAMIC_INPUT(components, TensorType({DT_INT8, DT_UINT8, DT_INT16, \
-        DT_UINT16, DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, \
-        DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL}))
+    .DYNAMIC_INPUT(components, TensorType({DT_FLOAT, DT_FLOAT16, \
+        DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, \
+        DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE, DT_RESOURCE, \
+        DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT16, DT_QUINT16, \
+        DT_QINT8, DT_QUINT8, DT_QINT32}))
     .ATTR(timeout_ms, Int, -1)
     .OP_END_FACTORY_REG(QueueEnqueueMany)

+/**
+*@brief Dequeues a tuple of one or more tensors from the given queue.
+
+*@par Inputs:
+*The input handle must have the resource type. Inputs include: \n
+*handle:A Tensor of type mutable resource. The handle to a queue.
+
+*@par Attributes:
+*@li timeout_ms: An optional int. Defaults to -1. If the queue is empty, this \n
+operation will block for up to timeout_ms milliseconds. Note: This option is \n
+not supported yet.
+*@li component_types: A list of DTypes that has length >= 1. The type of each \n
+component in a tuple.
+
+*@par Outputs:
+*components:A list of Tensor objects of type component_types.
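+
+*@par Example:
+*A minimal wiring sketch (illustrative only; it assumes the C++ operator \n
+*classes and setter helpers generated from these registrations, and omits \n
+*graph construction and session setup):
+*@code
+*  // Create a FIFO queue of single-component float tuples, then dequeue one.
+*  auto queue = ge::op::FIFOQueue("fifo_queue")
+*      .set_attr_component_types(std::vector<ge::DataType>{ge::DT_FLOAT})
+*      .set_attr_capacity(16);
+*  auto dequeue = ge::op::QueueDequeue("dequeue")
+*      .set_input_handle(queue, "handle")
+*      .set_attr_component_types(std::vector<ge::DataType>{ge::DT_FLOAT});
+*  dequeue.create_dynamic_output_components(1);
+*@endcode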
+
+*/
+
 REG_OP(QueueDequeue)
     .INPUT(handle, TensorType({DT_RESOURCE}))
-    .DYNAMIC_OUTPUT(components, TensorType({DT_INT8, DT_UINT8, DT_INT16, \
-        DT_UINT16, DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, \
-        DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL}))
+    .DYNAMIC_OUTPUT(components, TensorType({DT_FLOAT, DT_FLOAT16, \
+        DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, \
+        DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE, DT_RESOURCE, \
+        DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT16, DT_QUINT16, \
+        DT_QINT8, DT_QUINT8, DT_QINT32}))
     .ATTR(timeout_ms, Int, -1)
     .REQUIRED_ATTR(component_types, ListType)
     .OP_END_FACTORY_REG(QueueDequeue)

+/**
+*@brief Dequeues n tuples of one or more tensors from the given queue.
+
+*@par Inputs:
+*The input handle must have the resource type. Inputs include: \n
+*@li handle:A Tensor of type mutable resource. The handle to a queue.
+*@li n: A Tensor of type int32. The number of tuples to dequeue.
+
+*@par Attributes:
+*@li timeout_ms: An optional int. Defaults to -1. If the queue has fewer than \n
+n elements, this operation will block for up to timeout_ms milliseconds. \n
+Note: This option is not supported yet.
+*@li component_types: A list of DTypes that has length >= 1. The type of each \n
+component in a tuple.
+
+*@par Outputs:
+*components:A list of Tensor objects of type component_types.
+
+*/
+
 REG_OP(QueueDequeueMany)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .INPUT(n, TensorType({DT_INT32}))
-    .DYNAMIC_OUTPUT(components, TensorType({DT_INT8, DT_UINT8, \
-        DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, \
-        DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL}))
+    .DYNAMIC_OUTPUT(components, TensorType({DT_FLOAT, DT_FLOAT16, \
+        DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, \
+        DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE, DT_RESOURCE, \
+        DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT16, DT_QUINT16, \
+        DT_QINT8, DT_QUINT8, DT_QINT32}))
     .ATTR(timeout_ms, Int, -1)
     .REQUIRED_ATTR(component_types, ListType)
     .OP_END_FACTORY_REG(QueueDequeueMany)

+/**
+*@brief Dequeues up to n tuples of one or more tensors from the given queue.
+
+*@par Inputs:
+*The input handle must have the resource type. Inputs include: \n
+*@li handle:A Tensor of type mutable resource. The handle to a queue.
+*@li n: A Tensor of type int32. The number of tuples to dequeue.
+
+*@par Attributes:
+*@li timeout_ms: An optional int. Defaults to -1. If the queue has fewer than \n
+n elements, this operation will block for up to timeout_ms milliseconds. \n
+Note: This option is not supported yet.
+*@li component_types: A list of DTypes that has length >= 1. The type of each \n
+component in a tuple.
+
+*@par Outputs:
+*components:A list of Tensor objects of type component_types.
+
+*/
+
 REG_OP(QueueDequeueUpTo)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .INPUT(n, TensorType({DT_INT32}))
-    .DYNAMIC_OUTPUT(components, TensorType({DT_INT8, DT_UINT8, \
-        DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_UINT32, \
-        DT_UINT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL}))
+    .DYNAMIC_OUTPUT(components, TensorType({DT_FLOAT, DT_FLOAT16, \
+        DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, \
+        DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE, DT_RESOURCE, \
+        DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT16, DT_QUINT16, \
+        DT_QINT8, DT_QUINT8, DT_QINT32}))
     .ATTR(timeout_ms, Int, -1)
     .REQUIRED_ATTR(component_types, ListType)
     .OP_END_FACTORY_REG(QueueDequeueUpTo)

+/**
+*@brief Stage values similar to a lightweight Enqueue.
+
+*@par Inputs:
+*The input values must be a list of Tensor objects. Inputs include: \n
+*values: A list of Tensor objects. A list of data types that inserted values \n
+should adhere to.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to 0. Maximum number of \n
+elements in the Staging Area. If > 0, inserts on the container will block \n
+when the capacity is reached.
+*@li memory_limit: An optional int that is >= 0. Defaults to 0. The maximum \n
+number of bytes allowed for Tensors in the Staging Area. If > 0, inserts will \n
+block until sufficient space is available.
+*@li container: An optional string. Defaults to "". If non-empty, this queue \n
+is placed in the given container. Otherwise, a default container is used.
+*@li shared_name: An optional string. Defaults to "". It is necessary to \n
+match this name to the matching Unstage Op.
+
+*@see Unstage
+
+*/
+
 REG_OP(Stage)
     .DYNAMIC_INPUT(values, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, \
         DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \
-        DT_DOUBLE, DT_UINT32, DT_UINT64}))
+        DT_DOUBLE}))
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
     .ATTR(container, String, "")
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(Stage)

+/**
+*@brief Op removes all elements in the underlying container.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to 0.
+*@li memory_limit: An optional int that is >= 0. Defaults to 0.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+*@li dtypes: A list of DTypes.
+
+*@see Stage
+
+*/
+
 REG_OP(StageClear)
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
@@ -104,11 +288,31 @@ REG_OP(StageClear)
     .ATTR(dtypes, ListType, {})
     .OP_END_FACTORY_REG(StageClear)

+/**
+*@brief Op peeks at the values at the specified index. If the underlying \n
+container does not contain sufficient elements, this op will block until it does.
+
+*@par Inputs:
+*The input index must be type int32. Inputs include: \n
+*index: A Tensor of type int32.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to 0.
+*@li memory_limit: An optional int that is >= 0. Defaults to 0.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+*@li dtypes: A list of DTypes that has length >= 1.
+
+*@par Outputs:
+*y:A list of Tensor objects of type dtypes.
+
+*/
+
 REG_OP(StagePeek)
     .INPUT(index, TensorType({DT_INT32}))
     .DYNAMIC_OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \
-        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \
-        DT_DOUBLE, DT_UINT32, DT_UINT64}))
+        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \
+        DT_DOUBLE}))
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
     .ATTR(container, String, "")
@@ -116,6 +320,21 @@ REG_OP(StagePeek)
     .ATTR(dtypes, ListType, {})
     .OP_END_FACTORY_REG(StagePeek)

+/**
+*@brief Op returns the number of elements in the underlying container.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to 0.
+*@li memory_limit: An optional int that is >= 0. Defaults to 0.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+*@li dtypes: A list of DTypes that has length >= 1.
+
+*@par Outputs:
+*size:A Tensor of type int32.
+
+*/
+
 REG_OP(StageSize)
     .OUTPUT(size, TensorType({DT_INT32}))
     .ATTR(capacity, Int, 0)
@@ -125,29 +344,86 @@ REG_OP(StageSize)
     .ATTR(dtypes, ListType, {})
     .OP_END_FACTORY_REG(StageSize)

+/**
+*@brief Pop the element at the top of the stack.
+
+*@par Inputs:
+*The input handle must be type resource. Inputs include: \n
+*handle: A Tensor of type resource. The handle to a stack.
+
+*@par Attributes:
+*elem_type: A DType. The type of the elem that is popped.
+
+*@par Outputs:
+*element:A Tensor of type elem_type.
+
+*/
+
 REG_OP(StackPop)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .OUTPUT(element, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \
         DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \
-        DT_DOUBLE, DT_UINT32, DT_UNIT64}))
+        DT_DOUBLE}))
     .REQUIRED_ATTR(elem_type, Type)
     .OP_END_FACTORY_REG(StackPop)

+/**
+*@brief Push an element onto the stack.
+
+*@par Inputs:
+*The input handle must be type resource. Inputs include: \n
+*@li handle: A Tensor of type resource. The handle to a stack.
+*@li elem: A Tensor. The tensor to be pushed onto the stack.
+
+*@par Attributes:
+*swap_memory: An optional bool. Defaults to "false". Swap "elem" to CPU.
+
+*@par Outputs:
+*y:A Tensor. Has the same type as elem.
+
+*/
+
 REG_OP(StackPush)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .INPUT(element, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \
         DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \
-        DT_DOUBLE, DT_UINT32, DT_UNIT64}))
+        DT_DOUBLE}))
     .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \
         DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \
-        DT_DOUBLE, DT_UINT32, DT_UNIT64}))
+        DT_DOUBLE}))
     .ATTR(swap_memory, Bool, false)
     .OP_END_FACTORY_REG(StackPush)

+/**
+*@brief Close the stack.
+
+*@par Inputs:
+*The input handle must be type resource. Inputs include: \n
+*handle: A Tensor of type resource. The handle to a stack.
+
+*/
+
 REG_OP(StackClose)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .OP_END_FACTORY_REG(StackClose)

+/**
+*@brief Create a stack.
+
+*@par Inputs:
+*The input max_size must be type int32. Inputs include: \n
+*max_size: A Tensor of type int32. The number of elements of a stack.
+
+*@par Attributes:
+*@li stack_name: An optional string. Defaults to "".
+*@li elem_type: The element type of the created Stack.
+
+*@par Outputs:
+*handle: A Tensor of type resource. The handle to a stack.
+
+*/
+
 REG_OP(Stack)
     .INPUT(max_size, TensorType({DT_INT32}))
     .OUTPUT(handle, TensorType({DT_RESOURCE}))
@@ -155,33 +431,124 @@ REG_OP(Stack)
     .REQUIRED_ATTR(elem_type, Type)
     .OP_END_FACTORY_REG(Stack)

+/**
+*@brief Partitions "x" into "num_partitions" tensors using indices from "partitions".
+
+*@par Inputs:
+*Including: \n
+* @li x: The Tensor to be sliced. Must be one of the following types: \n
+DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \n
+DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \n
+DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING.
+* @li partitions: A Tensor of type DT_INT32, with any shape. The indices.
+
+*@par Attributes:
+*num_partitions: The number of partitions to output.
+
+*@par Outputs:
+*y: A list of "num_partitions" tensors with the same type as "x".
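+
+*@par Example:
+*A minimal wiring sketch (illustrative only; "x" and "partitions" are \n
+*assumed to be upstream operators, and the generated setter helpers are used):
+*@code
+*  // Split "x" into two output tensors according to "partitions".
+*  auto partition = ge::op::DynamicPartition("dynamic_partition")
+*      .set_input_x(x)
+*      .set_input_partitions(partitions)
+*      .set_attr_num_partitions(2);
+*  partition.create_dynamic_output_y(2);
+*@endcode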
+ +*@attention Constraints:\n +*DynamicPartition runs on the Ascend AI CPU, which delivers poor performance.\n + +*/ + REG_OP(DynamicPartition) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .INPUT(partitions, TensorType({DT_INT32})) .DYNAMIC_OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .ATTR(num_partitions, Int, 1) .OP_END_FACTORY_REG(DynamicPartition) +/** +*@brief Interleaves the values from the "x" tensors into a single tensor. + +*@par Inputs: +*Including: \n +* @li indices: A list of at least 1 Tensor objects with type DT_INT32. +* @li x: A list with the same length as "indices" of Tensor objects. \n +Must be one of the following types: DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \n +DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_QINT32, \n +DT_QUINT8, DT_QINT8, DT_STRING, DT_COMPLEX64, DT_COMPLEX128. + +*@par Attributes: +*N: An int that is >= 1. Defaults to "1". + +*@par Outputs: +*y: A Tensor. Has the same type as "x". + +*@attention Constraints:\n +*DynamicStitch runs on the Ascend AI CPU, which delivers poor performance.\n + +*/ + REG_OP(DynamicStitch) .DYNAMIC_INPUT(indices, TensorType({DT_INT32})) .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_QINT32, DT_QUINT8, DT_QINT8, DT_STRING, DT_COMPLEX64, \ + DT_COMPLEX128})) .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, \ - DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_QINT32, DT_QUINT8, DT_QINT8, DT_STRING, DT_COMPLEX64, \ + DT_COMPLEX128})) .ATTR(N, Int, 1) .OP_END_FACTORY_REG(DynamicStitch) +/** +*@brief Interleaves the values from the "x" tensors into a single tensor. + +*@par Inputs: +*Including: \n +* @li indices: A list of at least 1 Tensor objects with type DT_INT32. +* @li x: A list with the same length as "indices" of Tensor objects. \n +Must be one of the following types: DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \n +DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_STRING, \n +DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, DT_QINT32. + +*@par Attributes: +*N: An int that is >= 1. Defaults to "1". + +*@par Outputs: +*y: A Tensor. Has the same type as "x". 
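+
+*@par Example:
+*A minimal wiring sketch (illustrative only; the two index operators and \n
+*two data operators are assumed to exist upstream, and the generated \n
+*dynamic-input helpers are assumed):
+*@code
+*  // Interleave two data tensors back into one tensor using their indices.
+*  auto stitch = ge::op::ParallelDynamicStitch("stitch")
+*      .set_attr_N(2);
+*  stitch.create_dynamic_input_indices(2);
+*  stitch.create_dynamic_input_x(2);
+*  stitch.set_dynamic_input_indices(0, indices0);
+*  stitch.set_dynamic_input_indices(1, indices1);
+*  stitch.set_dynamic_input_x(0, x0);
+*  stitch.set_dynamic_input_x(1, x1);
+*@endcode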
+
+*@attention Constraints:\n
+*ParallelDynamicStitch runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(ParallelDynamicStitch)
     .DYNAMIC_INPUT(indices, TensorType({DT_INT32}))
-    .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
-    .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, \
-        DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
+    .DYNAMIC_INPUT(x,
+        TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, \
+        DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \
+        DT_QINT8, DT_QUINT8, DT_QINT32 }))
+    .OUTPUT(y,
+        TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, \
+        DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \
+        DT_QINT8, DT_QUINT8, DT_QINT32 }))
    .ATTR(N, Int, 1)
    .OP_END_FACTORY_REG(ParallelDynamicStitch)

+/**
+*@brief Removes all elements in the underlying container.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to "0".
+*@li memory_limit: An optional int that is >= 0. Defaults to "0".
+*@li dtypes: A list of tf.DTypes.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+
+*@attention Constraints:\n
+*MapClear runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(MapClear)
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
@@ -190,6 +557,24 @@ REG_OP(MapClear)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(MapClear)

+/**
+*@brief Returns the number of incomplete elements in the underlying container.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to "0".
+*@li memory_limit: An optional int that is >= 0. Defaults to "0".
+*@li dtypes: A list of tf.DTypes.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+
+*@par Outputs:
+*size: A Tensor of type DT_INT32.
+
+*@attention Constraints:\n
+*MapIncompleteSize runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(MapIncompleteSize)
     .OUTPUT(size, TensorType({DT_INT32}))
     .ATTR(capacity, Int, 0)
@@ -199,10 +584,25 @@ REG_OP(MapIncompleteSize)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(MapIncompleteSize)

+/**
+*@brief Unstage Op is similar to a lightweight Dequeue.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to 0.
+*@li memory_limit: An optional int that is >= 0. Defaults to 0.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+*@li dtypes: A list of DTypes that has length >= 1.
+
+*@par Outputs:
+*y: A list of Tensor objects of type dtypes.
+
+*/
+
 REG_OP(Unstage)
     .DYNAMIC_OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \
         DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \
-        DT_DOUBLE, DT_UINT32, DT_UINT64}))
+        DT_DOUBLE}))
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
     .ATTR(container, String, "")
@@ -210,13 +610,46 @@ REG_OP(Unstage)
     .REQUIRED_ATTR(dtypes, ListType)
     .OP_END_FACTORY_REG(Unstage)

+/**
+*@brief Stage (key, values) in the underlying container which behaves like a hashtable.
+
+*@par Inputs:
+*Including: \n
+* @li key: A Tensor of type DT_INT64.
+* @li indices: A Tensor of type DT_INT32.
+* @li values: A list of Tensor objects for tensor dtypes. \n
+A list of data types that inserted values should adhere to. \n
+Must be one of the following types: DT_FLOAT, DT_FLOAT16, \n
+DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, \n
+DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, DT_UINT64, \n
+DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \n
+DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to "0". \n
+Maximum number of elements in the Staging Area. If > 0, \n
+inserts on the container will block when the capacity is reached.
+*@li memory_limit: An optional int that is >= 0. Defaults to "0".
+*@li dtypes: A list of tf.DTypes.
+*@li container: An optional string. Defaults to "". \n
+If non-empty, this queue is placed in the given container. \n
+Otherwise, a default container is used.
+*@li shared_name: An optional string. Defaults to "". \n
+It is necessary to match this name to the matching Unstage Op.
+
+*@attention Constraints:\n
+*MapStage runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(MapStage)
     .INPUT(key, TensorType({DT_INT64}))
     .INPUT(indices, TensorType({DT_INT32}))
     .DYNAMIC_INPUT(values,
-                   TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
-                               DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \
-                               DT_UINT32, DT_UINT64}))
+                   TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
+                                DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, \
+                                DT_UINT64, DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \
+                                DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32 }))
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
     .ATTR(dtypes, ListType, {})
@@ -224,13 +657,41 @@ REG_OP(MapStage)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(MapStage)

+/**
+*@brief Removes and returns the values associated with the key.
+
+*@par Inputs:
+*Including: \n
+* @li key: A Tensor of type DT_INT64.
+* @li indices: A Tensor of type DT_INT32.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to "0".
+*@li memory_limit: An optional int that is >= 0. Defaults to "0".
+*@li dtypes: A list of DTypes that has length >= 1.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+
+*@par Outputs:
+*values: A list of Tensor objects. Must be one of the following types: \n
+DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, \n
+DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, DT_UINT64, DT_RESOURCE, \n
+DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, \n
+DT_QINT16, DT_QUINT16, DT_QINT32.
+
+*@attention Constraints:\n
+*MapUnstage runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(MapUnstage)
     .INPUT(key, TensorType({DT_INT64}))
     .INPUT(indices, TensorType({DT_INT32}))
     .DYNAMIC_OUTPUT(values,
-                    TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
-                                DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \
-                                DT_UINT32, DT_UINT64}))
+                    TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
+                                 DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, \
+                                 DT_UINT64, DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \
+                                 DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32 }))
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
     .ATTR(dtypes, ListType, {})
@@ -238,13 +699,41 @@ REG_OP(MapUnstage)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(MapUnstage)

+/**
+*@brief Removes and returns a random (key, value).
+ +*@par Inputs: +*Including: \n +*indices: A Tensor of type DT_INT32. + +*@par Attributes: +*@li capacity: An optional int that is >= 0. Defaults to "0". +*@li memory_limit: An optional int that is >= 0. Defaults to "0". +*@li dtypes: A list of DTypes that has length >= 1. +*@li container: An optional string. Defaults to "". +*@li shared_name: An optional string. Defaults to "". + +*@par Outputs: +*@li key: A Tensor of type DT_INT64. +*@li values: A list of Tensor objects. \n +Must be one of the following types: DT_FLOAT, DT_FLOAT16, DT_INT8, \n +DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \n +DT_UINT32, DT_UINT64, DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \n +DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32. + +*@attention Constraints:\n +*MapUnstageNoKey runs on the Ascend AI CPU, which delivers poor performance.\n + +*/ + REG_OP(MapUnstageNoKey) .INPUT(indices, TensorType({DT_INT32})) .OUTPUT(key, TensorType({DT_INT64})) .DYNAMIC_OUTPUT(values, - TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \ - DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \ - DT_UINT32, DT_UINT64})) + TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \ + DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, \ + DT_UINT64, DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \ + DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32 })) .ATTR(capacity, Int, 0) .ATTR(memory_limit, Int, 0) .ATTR(dtypes, ListType, {}) @@ -252,13 +741,41 @@ REG_OP(MapUnstageNoKey) .ATTR(shared_name, String, "") .OP_END_FACTORY_REG(MapUnstageNoKey) +/** +*@brief Peeks at the values at the specified key. + +*@par Inputs: +*Including: \n +* @li key: A Tensor of type DT_INT64. +* @li indices: A Tensor of type DT_INT32. + +*@par Attributes: +*@li capacity: An optional int that is >= 0. Defaults to "0". +*@li memory_limit: An optional int that is >= 0. Defaults to "0". +*@li dtypes: A list of tf.DTypes that has length >= 1. +*@li container: An optional string. Defaults to "". +*@li shared_name: An optional string. Defaults to "". + +*@par Outputs: +*values: A list of Tensor objects of type "dtypes". \n +Must be one of the following types: DT_FLOAT, DT_FLOAT16, DT_INT8, \n +DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \n +DT_DOUBLE, DT_UINT32, DT_UINT64, DT_RESOURCE, DT_STRING, DT_COMPLEX64, \n +DT_COMPLEX128, DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32. + +*@attention Constraints:\n +*MapPeek runs on the Ascend AI CPU, which delivers poor performance.\n + +*/ + REG_OP(MapPeek) .INPUT(key, TensorType({DT_INT64})) .INPUT(indices, TensorType({DT_INT32})) .DYNAMIC_OUTPUT(values, - TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \ - DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \ - DT_UINT32, DT_UINT64})) + TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \ + DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, \ + DT_UINT64, DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \ + DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32 })) .ATTR(capacity, Int, 0) .ATTR(memory_limit, Int, 0) .ATTR(dtypes, ListType, {}) @@ -266,6 +783,24 @@ REG_OP(MapPeek) .ATTR(shared_name, String, "") .OP_END_FACTORY_REG(MapPeek) +/** +*@brief Returns the number of elements in the underlying container. + +*@par Attributes: +*@li capacity: An optional int that is >= 0. Defaults to "0". +*@li memory_limit: An optional int that is >= 0. Defaults to "0". +*@li dtypes: A list of tf.DTypes. 
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+
+*@par Outputs:
+*size: A Tensor of type DT_INT32.
+
+*@attention Constraints:\n
+*MapSize runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(MapSize)
     .OUTPUT(size, TensorType({DT_INT32}))
     .ATTR(capacity, Int, 0)
@@ -275,6 +810,33 @@ REG_OP(MapSize)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(MapSize)

+/**
+*@brief Class wrapping dynamic-sized, per-time-step, write-once Tensor arrays.
+
+*@par Inputs:
+*The input size must be type int32. Inputs include: \n
+*@li size: int32 scalar Tensor: the size of the TensorArray. Required if \n
+handle is not provided.
+
+*@par Attributes:
+*@li dtype: The data type of this TensorArray.
+*@li element_shape: The TensorShape of elements in this TensorArray.
+*@li dynamic_size: A boolean that determines whether writes to the \n
+TensorArray are allowed to grow the size.
+*@li clear_after_read: Boolean (optional, default: True). If True, clear \n
+TensorArray values \n
+after reading them. This disables read-many semantics, but allows early \n
+release of memory.
+*@li identical_element_shapes: If true (default is false), then all elements \n
+in the TensorArray will be expected to have identical shapes.
+*@li tensor_array_name: String: the name of the TensorArray.
+
+*@par Outputs:
+*@li handle: The handle to the TensorArray.
+*@li flow: A scalar used to control gradient flow.
+
+*/
+
 REG_OP(TensorArray)
     .INPUT(size, TensorType({DT_INT32}))
     .OUTPUT(handle, TensorType({DT_RESOURCE}))
@@ -287,30 +849,106 @@ REG_OP(TensorArray)
     .ATTR(tensor_array_name, String, "")
     .OP_END_FACTORY_REG(TensorArray)

+/**
+*@brief Delete the TensorArray from its resource container.
+
+*@par Inputs:
+*The input handle must be type resource. Inputs include: \n
+*handle: A Tensor of type resource. The handle to a TensorArray \n
+(output of TensorArray or TensorArrayGrad).
+
+*/
+
 REG_OP(TensorArrayClose)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .OP_END_FACTORY_REG(TensorArrayClose)

+/**
+*@brief Concat the elements from the TensorArray into the output "value".
+
+*@par Inputs:
+*The input handle must be type resource. Inputs include: \n
+*@li handle: The handle to a TensorArray.
+*@li flow_in: A float scalar that enforces proper chaining of operations.
+
+*@par Attributes:
+*@li dtype: The type of the elem that is returned.
+*@li element_shape_except0: The expected shape of an element, if known, \n
+excluding the first dimension.
+
+*@par Outputs:
+*@li value: All of the elements in the TensorArray, concatenated along \n
+the first axis.
+*@li lengths: A vector of the row sizes of the original T elements in the \n
+value output.
+
+*/
+
 REG_OP(TensorArrayConcat)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .INPUT(flow_in, TensorType({DT_FLOAT}))
-    .OUTPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_INT8, \
-        DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL}))
+    .OUTPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_INT8,
+        DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL,
+        DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8,
+        DT_QUINT8, DT_QINT32}))
     .OUTPUT(lengths, TensorType({DT_INT64}))
     .REQUIRED_ATTR(dtype, Type)
     .ATTR(element_shape_except0, ListInt, ge::UNKNOWN_SHAPE)
     .OP_END_FACTORY_REG(TensorArrayConcat)

+/**
+*@brief Gathers specific elements from the TensorArray into the output "value". \n
+*All elements selected by "indices" must have the same shape.
+
+*@par Inputs:
+*The input handle must be type resource. Inputs include: \n
+*@li handle: The handle to a TensorArray.
+*@li indices: The locations in the TensorArray from which to read tensor \n +elements. +*@li flow_in: A float scalar that enforces proper chaining of operations. + +*@par Attributes: +*@li dtype: The type of the elem that is returned. +*@li element_shape: The expected shape of an element, if known. Used to \n +validate the shapes of TensorArray elements. If this shape is not fully \n +specified, gathering zero-size TensorArrays is an error. + +*@par Outputs: +*value: All of the elements in the TensorArray, concatenated along a new \n +axis (the new dimension 0). + +*/ + REG_OP(TensorArrayGather) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(indices, TensorType({DT_INT32})) .INPUT(flow_in, TensorType({DT_FLOAT})) - .OUTPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_INT8, \ - DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL})) + .OUTPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_INT8, + DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, + DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, + DT_QUINT8, DT_QINT32})) .REQUIRED_ATTR(dtype, Type) .ATTR(element_shape, ListInt, ge::UNKNOWN_SHAPE) .OP_END_FACTORY_REG(TensorArrayGather) +/** +*@brief Creates a TensorArray for storing the gradients of values in the \n +given handle. + +*@par Inputs: +*The input handle must be type resource. Inputs include: \n +*@li handle: The handle to a TensorArray. +*@li flow_in: A float scalar that enforces proper chaining of operations. + +*@par Attributes: +*source: The gradient source string, used to decide which gradient \n +TensorArray to return. + +*@par Outputs: +*@li grad_handle: A Tensor of type resource. +*@li flow_out: A Tensor of type float. + +*/ + REG_OP(TensorArrayGrad) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(flow_in, TensorType({DT_FLOAT})) @@ -319,15 +957,53 @@ REG_OP(TensorArrayGrad) .REQUIRED_ATTR(source, String) .OP_END_FACTORY_REG(TensorArrayGrad) +/** +*@brief Push an element onto the tensor_array. + +*@par Inputs: +*The input handle must be type resource. Inputs include: \n +*@li handle: The handle to a TensorArray. +*@li index: The position to write to inside the TensorArray. +*@li value: The tensor to write to the TensorArray. +*@li flow_in: A float scalar that enforces proper chaining of operations. + +*@par Outputs: +*flow_out: A float scalar that enforces proper chaining of operations. + +*/ + REG_OP(TensorArrayWrite) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(index, TensorType({DT_INT32})) - .INPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_INT8, \ - DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL})) + .INPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_INT8, + DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, + DT_STRING, DT_COMPLEX64, DT_COMPLEX128})) .INPUT(flow_in, TensorType({DT_FLOAT})) .OUTPUT(flow_out, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(TensorArrayWrite) +/** +*@brief Creates a TensorArray for storing multiple gradients of values in \n +the given handle. + +*@par Inputs: +*The input handle must be type resource. Inputs include: \n +*@li handle: A Tensor of type resource. The handle to the forward TensorArray. +*@li flow_in: A Tensor of type float. A float scalar that enforces proper \n +chaining of operations. +*@li shape_to_prepend: A Tensor of type int32. An int32 vector representing \n +a shape. + +*@par Attributes: +*source: A string. The gradient source string, used to decide which gradient \n +TensorArray to return. 
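+
+ *@par Example:
+ * A minimal wiring sketch (illustrative only; "ta" stands for a TensorArray \n
+ * operator defined upstream, "shape_to_prepend" for an int32 vector operator, \n
+ * and the generated setter helpers are assumed):
+ *@code
+ *   // Attach a gradient TensorArray to an existing forward TensorArray.
+ *   auto grad = ge::op::TensorArrayGradWithShape("ta_grad")
+ *       .set_input_handle(ta, "handle")
+ *       .set_input_flow_in(ta, "flow")
+ *       .set_input_shape_to_prepend(shape_to_prepend)
+ *       .set_attr_source("grad");
+ *@endcode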
+ +*@par Outputs: +*@li grad_handle: A Tensor of type resource. +*@li flow_out: A Tensor of type float. + +*/ + REG_OP(TensorArrayGradWithShape) .INPUT(handle, TensorType({ DT_RESOURCE })) .INPUT(flow_in, TensorType({ DT_FLOAT })) @@ -337,39 +1013,123 @@ REG_OP(TensorArrayGradWithShape) .ATTR(source, String, "") .OP_END_FACTORY_REG(TensorArrayGradWithShape) +/** +*@brief Read an element from the TensorArray into output value. + +*@par Inputs: +*The input handle must be type resource. Inputs include: \n +*@li handle: A Tensor of type resource. The handle to a TensorArray. +*@li index: A Tensor of type int32. +*@li flow_in: A Tensor of type float. + +*@par Attributes: +*dtype: A DType. + +*@par Outputs: +*y: A Tensor of type dtype. + +*/ + REG_OP(TensorArrayRead) .INPUT(handle, TensorType({ DT_RESOURCE })) .INPUT(index, TensorType({ DT_INT32 })) .INPUT(flow_in, TensorType({ DT_FLOAT })) - .OUTPUT(y, TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE })) + .OUTPUT(y, TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, + DT_STRING, DT_COMPLEX64, DT_COMPLEX128})) .REQUIRED_ATTR(dtype, Type) .OP_END_FACTORY_REG(TensorArrayRead) +/** +*@brief Scatter the data from the input value into specific TensorArray \n +elements. + +*@par Inputs: +*The input handle must be type resource. Inputs include: \n +*@li handle: The handle to a TensorArray. +*@li indices: The locations at which to write the tensor elements. +*@li value: The concatenated tensor to write to the TensorArray. +*@li flow_in: A float scalar that enforces proper chaining of operations. + +*@par Outputs: +*flow_out: A float scalar that enforces proper chaining of operations. + +*/ + REG_OP(TensorArrayScatter) .INPUT(handle, TensorType({ DT_RESOURCE })) .INPUT(indices, TensorType({ DT_INT32 })) - .INPUT(value, TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE })) + .INPUT(value, TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, + DT_STRING, DT_COMPLEX64, DT_COMPLEX128 })) .INPUT(flow_in, TensorType({ DT_FLOAT })) .OUTPUT(flow_out, TensorType({ DT_FLOAT })) .OP_END_FACTORY_REG(TensorArrayScatter) +/** +*@brief Split the data from the input value into TensorArray elements. + +*@par Inputs: +*The input handle must be type resource. Inputs include: \n +*@li handle: The handle to a TensorArray. +*@li value: The concatenated tensor to write to the TensorArray. +*@li lengths: The vector of lengths, how to split the rows of value into \n +the TensorArray. +*@li flow_in: A float scalar that enforces proper chaining of operations. + +*@par Outputs: +*flow_out: A float scalar that enforces proper chaining of operations. + +*/ + REG_OP(TensorArraySplit) .INPUT(handle, TensorType({ DT_RESOURCE })) - .INPUT(value, TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE })) + .INPUT(value, TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, + DT_STRING, DT_COMPLEX64, DT_COMPLEX128 })) .INPUT(lengths, TensorType({ DT_INT64 })) .INPUT(flow_in, TensorType({ DT_FLOAT })) .OUTPUT(flow_out, TensorType({ DT_FLOAT })) .OP_END_FACTORY_REG(TensorArraySplit) +/** +*@brief Return the number of elements in a TensorArray. + +*@par Inputs: +*The input handle must be type resource. 
+*@li handle: The handle to a TensorArray.
+*@li flow_in: A float scalar that enforces proper chaining of operations.
+
+*@par Outputs:
+*size: The number of elements in a TensorArray.
+
+*/
+
 REG_OP(TensorArraySize)
     .INPUT(handle, TensorType({ DT_RESOURCE }))
     .INPUT(flow_in, TensorType({ DT_FLOAT }))
     .OUTPUT(size, TensorType({ DT_INT32 }))
     .OP_END_FACTORY_REG(TensorArraySize)

+/**
+*@brief A queue implementation that dequeues elements in a random order.
+
+*@par Attributes:
+*@li shapes: (Optional.) A list of fully-defined TensorShape objects with \n
+the same length as dtypes, or None.
+*@li capacity: An integer. The upper bound on the number of elements that may \n
+be stored in this queue.
+*@li min_after_dequeue: An integer. The minimum number of elements to keep in \n
+the queue after a dequeue, used to ensure a minimum level of mixing of elements.
+*@li seed: An integer. Used to create a random seed.
+*@li seed2: An integer. Used to create a random seed.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+
+*@par Outputs:
+*handle: A Tensor of type resource. The handle to the queue.
+
+*/
+
 REG_OP(RandomShuffleQueue)
     .OUTPUT(handle, TensorType({DT_RESOURCE}))
     .REQUIRED_ATTR(component_types, ListType)
@@ -382,6 +1142,34 @@ REG_OP(RandomShuffleQueue)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(RandomShuffleQueue)

+/**
+*@brief A queue that produces elements in first-in first-out order.
+
+*@par Attributes:
+*@li shapes: An optional list of shapes for each component of \n
+a queue element. Defaults to {}. The length of this attr must be \n
+either 0 or the same as the length of "component_types". Shapes of fixed \n
+rank but variable size are allowed by setting any shape dimension to "-1". \n
+In this case, the inputs' shape may vary along the given dimension, \n
+and DequeueMany will pad the given dimension with zeros up to the maximum \n
+shape of all elements in the given batch. If the length of this attr is "0", \n
+different queue elements may have different ranks and shapes, but only one \n
+element may be dequeued at a time.
+*@li capacity: An optional int. Defaults to "-1". The upper bound on the number \n
+of elements in this queue. Negative numbers mean no limit.
+*@li container: An optional string. Defaults to "". If non-empty, this queue \n
+is placed in the given container. Otherwise, a default container is used.
+*@li shared_name: An optional string. Defaults to "". If non-empty, this queue \n
+will be shared under the given name across multiple sessions.
+
+*@par Outputs:
+*handle: A Tensor of type DT_RESOURCE.
+
+*@attention Constraints:\n
+*PaddingFIFOQueue runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(PaddingFIFOQueue)
     .OUTPUT(handle, TensorType({DT_RESOURCE}))
     .REQUIRED_ATTR(component_types, ListType)
@@ -391,6 +1179,29 @@ REG_OP(PaddingFIFOQueue)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(PaddingFIFOQueue)
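Since "component_types" is the only required attribute of PaddingFIFOQueue, a minimal construction looks roughly like the sketch below. The setter names follow the usual set_attr_<name> pattern generated by REG_OP and should be treated as assumptions, not a verified API surface for this release:

// Sketch: a single-component padding FIFO queue handle.
ge::op::PaddingFIFOQueue MakePaddedQueue() {
  auto queue = ge::op::PaddingFIFOQueue("fifo_queue");
  queue.set_attr_component_types({ge::DT_FLOAT})  // required
       .set_attr_shapes({{-1, 128}})   // -1: DequeueMany pads this dimension
       .set_attr_capacity(-1)          // negative: no element limit
       .set_attr_shared_name("shared_fifo");
  return queue;
}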
+/**
+*@brief A queue that produces elements sorted by the first component value.
+
+*@par Attributes:
+*@li component_types: An optional list of tf.DTypes. Defaults to {}. \n
+The type of each component in a value.
+*@li shapes: A list of shapes for each component of a queue element. \n
+The length of this attr must be either 0 or the same as the length of \n
+"component_types". If the length of this attr is 0, the shapes of queue \n
+elements are not constrained, and only one element may be dequeued at a time.
+*@li container: An optional string. Defaults to "". If non-empty, this queue \n
+is placed in the given container. Otherwise, a default container is used.
+*@li shared_name: An optional string. Defaults to "". If non-empty, this \n
+queue will be shared under the given name across multiple sessions.
+
+*@par Outputs:
+*handle: A Tensor of type DT_RESOURCE.
+
+*@attention Constraints:\n
+*PriorityQueue runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(PriorityQueue)
     .OUTPUT(handle, TensorType({DT_RESOURCE}))
     .ATTR(component_types, ListType, {})
@@ -400,17 +1211,67 @@ REG_OP(PriorityQueue)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(PriorityQueue)

+/**
+*@brief Closes the given queue.
+
+*@par Inputs:
+*Including: \n
+*handle: A Tensor of type DT_RESOURCE. The handle to a queue.
+
+*@par Attributes:
+*cancel_pending_enqueues: An optional bool. Defaults to "False". \n
+If true, all pending enqueue requests that are blocked on \n
+the given queue will be canceled.
+
+*@attention Constraints:\n
+*QueueClose runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(QueueClose)
     .INPUT(handle, TensorType({DT_RESOURCE}))
     .ATTR(cancel_pending_enqueues, Bool, false)
     .OP_END_FACTORY_REG(QueueClose)

+/**
+*@brief Stages (key, values) in the underlying container, which behaves like \n
+an ordered associative container.
+
+*@par Inputs:
+*Including: \n
+* @li key: A Tensor of type DT_INT64.
+* @li indices: A Tensor of type DT_INT32.
+* @li values: A list of tensors to insert, which must adhere to "dtypes". \n
+Must be one of the following types: \n
+DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, \n
+DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, DT_UINT64, \n
+DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, \n
+DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to "0". \n
+Maximum number of elements in the Staging Area. \n
+If > 0, inserts on the container will block \n
+when the capacity is reached.
+*@li memory_limit: An optional int that is >= 0. Defaults to "0".
+*@li dtypes: A list of DTypes.
+*@li container: An optional string. Defaults to "". \n
+If non-empty, this queue is placed in the given container. \n
+Otherwise, a default container is used.
+*@li shared_name: An optional string. Defaults to "". \n
+It is necessary to match this name to the matching Unstage Op.
+
+*@attention Constraints:\n
+*OrderedMapStage runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(OrderedMapStage)
     .INPUT(key, TensorType({DT_INT64}))
     .INPUT(indices, TensorType({DT_INT32}))
-    .DYNAMIC_INPUT(values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
-                                       DT_INT32, DT_INT64, DT_FLOAT, DT_FLOAT16,
-                                       DT_DOUBLE, DT_BOOL, DT_UINT32, DT_UINT64}))
+    .DYNAMIC_INPUT(values,
+        TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
+        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, \
+        DT_UINT64, DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \
+        DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32 }))
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
     .ATTR(dtypes, ListType, {})
@@ -418,6 +1279,24 @@ REG_OP(OrderedMapStage)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(OrderedMapStage)

+/**
+*@brief Returns the number of elements in the underlying container.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to "0".
+*@li memory_limit: An optional int that is >= 0. Defaults to "0".
+*@li dtypes: A list of DTypes.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "". + +*@par Outputs: +*size: A Tensor of type DT_INT32. + +*@attention Constraints:\n +*OrderedMapSize runs on the Ascend AI CPU, which delivers poor performance.\n + +*/ + REG_OP(OrderedMapSize) .OUTPUT(size, TensorType({DT_INT32})) .ATTR(capacity, Int, 0) @@ -427,6 +1306,21 @@ REG_OP(OrderedMapSize) .ATTR(shared_name, String, "") .OP_END_FACTORY_REG(OrderedMapSize) +/** +*@brief Removes all elements in the underlying container. + +*@par Attributes: +*@li capacity: An optional int that is >= 0. Defaults to "0". +*@li memory_limit: An optional int that is >= 0. Defaults to "0". +*@li dtypes: A list of DTypes. +*@li container: An optional string. Defaults to "". +*@li shared_name: An optional string. Defaults to "". + +*@attention Constraints:\n +*OrderedMapClear runs on the Ascend AI CPU, which delivers poor performance.\n + +*/ + REG_OP(OrderedMapClear) .ATTR(capacity, Int, 0) .ATTR(memory_limit, Int, 0) @@ -435,6 +1329,25 @@ REG_OP(OrderedMapClear) .ATTR(shared_name, String, "") .OP_END_FACTORY_REG(OrderedMapClear) +/** +*@brief Returns the number of incomplete elements in the underlying container. + +*@par Attributes: +*@li capacity: An optional int that is >= 0. Defaults to "0". +*@li memory_limit: An optional int that is >= 0. Defaults to "0". +*@li dtypes: A list of DTypes. +*@li container: An optional string. Defaults to "". +*@li shared_name: An optional string. Defaults to "". + +*@par Outputs: +*size: A Tensor of type DT_INT32. + +*@attention Constraints:\n +*OrderedMapIncompleteSize runs on the Ascend AI CPU, \n +which delivers poor performance.\n + +*/ + REG_OP(OrderedMapIncompleteSize) .OUTPUT(size, TensorType({DT_INT32})) .ATTR(capacity, Int, 0) @@ -444,12 +1357,40 @@ REG_OP(OrderedMapIncompleteSize) .ATTR(shared_name, String, "") .OP_END_FACTORY_REG(OrderedMapIncompleteSize) +/** +*@brief Peeks at the values at the specified key. + +*@par Inputs: +*Including: \n +* @li key: A Tensor of type DT_INT64. +* @li indices: A Tensor of type DT_INT32. + +*@par Attributes: +*@li capacity: An optional int that is >= 0. Defaults to "0". +*@li memory_limit: An optional int that is >= 0. Defaults to "0". +*@li dtypes: A list of DTypes that has length >= 1. +*@li container: An optional string. Defaults to "". +*@li shared_name: An optional string. Defaults to "". + +*@par Outputs: +*values: A list of Tensor objects. Must be one of the following types: \n +DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, \n +DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, DT_UINT64, DT_RESOURCE, DT_STRING, \n +DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32. 
+
+*@attention Constraints:\n
+*OrderedMapPeek runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(OrderedMapPeek)
     .INPUT(key, TensorType({DT_INT64}))
     .INPUT(indices, TensorType({DT_INT32}))
-    .DYNAMIC_OUTPUT(values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
-                                        DT_INT32, DT_INT64, DT_FLOAT, DT_FLOAT16,
-                                        DT_DOUBLE, DT_BOOL, DT_UINT32, DT_UINT64}))
+    .DYNAMIC_OUTPUT(values,
+        TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
+        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, \
+        DT_UINT64, DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \
+        DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32 }))
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
     .ATTR(dtypes, ListType, {})
@@ -457,12 +1398,41 @@ REG_OP(OrderedMapPeek)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(OrderedMapPeek)

+/**
+*@brief Removes and returns the (key, value) element with the smallest key.
+
+*@par Inputs:
+*Including: \n
+* @li indices: A Tensor of type DT_INT32.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to "0".
+*@li memory_limit: An optional int that is >= 0. Defaults to "0".
+*@li dtypes: A list of DTypes that has length >= 1.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+
+*@par Outputs:
+*@li key: A Tensor of type DT_INT64.
+*@li values: A list of Tensor objects. Must be one of the following types: \n
+DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, \n
+DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, DT_UINT64, DT_RESOURCE, DT_STRING, \n
+DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32.
+
+*@attention Constraints:\n
+*OrderedMapUnstageNoKey runs on the Ascend AI CPU, \n
+which delivers poor performance.\n
+
+*/
+
 REG_OP(OrderedMapUnstageNoKey)
     .INPUT(indices, TensorType({DT_INT32}))
     .OUTPUT(key, TensorType({DT_INT64}))
-    .DYNAMIC_OUTPUT(values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
-                                        DT_INT32, DT_INT64, DT_FLOAT, DT_FLOAT16,
-                                        DT_DOUBLE, DT_BOOL, DT_UINT32, DT_UINT64}))
+    .DYNAMIC_OUTPUT(values,
+        TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
+        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_UINT32, \
+        DT_UINT64, DT_RESOURCE, DT_STRING, DT_COMPLEX64, DT_COMPLEX128, \
+        DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32 }))
     .ATTR(capacity, Int, 0)
     .ATTR(memory_limit, Int, 0)
     .ATTR(dtypes, ListType, {})
@@ -470,6 +1440,31 @@ REG_OP(OrderedMapUnstageNoKey)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(OrderedMapUnstageNoKey)
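A stage op and its unstage counterpart are paired purely through "shared_name", as the OrderedMapStage comment above notes. A hedged sketch follows; the create_dynamic_*/set_dynamic_* helpers are the usual expansions of DYNAMIC_INPUT/DYNAMIC_OUTPUT, and their exact signatures should be treated as assumptions:

// Sketch: stage a single FLOAT value, then drain it in key order.
void BuildStagingPair(ge::Operator &key, ge::Operator &indices,
                      ge::Operator &value) {
  auto stage = ge::op::OrderedMapStage("stage");
  stage.create_dynamic_input_values(1);     // one dynamic "values" slot
  stage.set_input_key(key)
       .set_input_indices(indices)
       .set_attr_dtypes({ge::DT_FLOAT})
       .set_attr_shared_name("staging_area");
  stage.set_dynamic_input_values(0, value);

  auto unstage = ge::op::OrderedMapUnstageNoKey("unstage");
  unstage.create_dynamic_output_values(1);
  unstage.set_input_indices(indices)
         .set_attr_dtypes({ge::DT_FLOAT})
         .set_attr_shared_name("staging_area");  // must match the stage op
}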
+/**
+*@brief Removes and returns the values associated with the key.
+
+*@par Inputs:
+*Including: \n
+* @li key: A Tensor of type DT_INT64.
+* @li indices: A Tensor of type DT_INT32.
+
+*@par Attributes:
+*@li capacity: An optional int that is >= 0. Defaults to "0".
+*@li memory_limit: An optional int that is >= 0. Defaults to "0".
+*@li dtypes: A list of tf.DTypes that has length >= 1.
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+
+*@par Outputs:
+*values: A list of Tensor objects. Must be one of the following types: \n
+DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT, \n
+DT_FLOAT16, DT_DOUBLE, DT_BOOL, DT_UINT32, DT_UINT64.
+
+*@attention Constraints:\n
+*OrderedMapUnstage runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(OrderedMapUnstage)
     .INPUT(key, TensorType({DT_INT64}))
     .INPUT(indices, TensorType({DT_INT32}))
@@ -483,6 +1478,32 @@ REG_OP(OrderedMapUnstage)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(OrderedMapUnstage)

+/**
+*@brief A barrier represents a key-value map, where each key is a string, \n
+and each value is a tuple of tensors.
+
+*@par Attributes:
+*@li component_types: The type of each component in a value.
+*@li shapes: A list of shapes for each component of a queue element. \n
+Each shape must be 1 in the first dimension. \n
+The length of this attr must be the same as \n
+the length of "component_types".
+*@li capacity: The capacity of the barrier. \n
+The default capacity is MAX_INT32, \n
+which is the largest capacity of the underlying queue.
+*@li container: If non-empty, this barrier is placed in the given container. \n
+Otherwise, a default container is used.
+*@li shared_name: If non-empty, this barrier will be shared under \n
+the given name across multiple sessions.
+
+*@par Outputs:
+*handle: A Tensor of type DT_STRING_REF. The handle to the barrier.
+
+*@attention Constraints:\n
+*Barrier runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(Barrier)
     .OUTPUT(handle, TensorType({DT_STRING_REF}))
     .REQUIRED_ATTR(component_types, ListType)
@@ -492,16 +1513,73 @@ REG_OP(Barrier)
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(Barrier)

+/**
+*@brief For each key, assigns the respective value to the specified component.
+
+*@par Inputs:
+*Including: \n
+* @li handle: A Tensor of type DT_STRING_REF. The handle to a barrier.
+* @li keys: A Tensor of type DT_STRING. A 1D tensor of keys.
+* @li values: An any-dimensional tensor of values, which are associated \n
+with the respective keys. The 0th dimension must have length n. \n
+Must be one of the following types: DT_FLOAT, DT_FLOAT16, DT_INT8, \n
+DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \n
+DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING.
+
+*@par Attributes:
+*component_index: The component of the barrier elements that is being assigned.
+
+*@attention Constraints:\n
+*BarrierInsertMany runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(BarrierInsertMany)
     .INPUT(handle, TensorType({DT_STRING_REF}))
     .INPUT(keys, TensorType({DT_STRING}))
     .INPUT(values,
            TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
                        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \
-                       DT_UINT32, DT_UINT64}))
+                       DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING}))
     .REQUIRED_ATTR(component_index, Int)
     .OP_END_FACTORY_REG(BarrierInsertMany)
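Per the registrations above, a barrier element becomes complete once every component has been inserted for its key, after which BarrierTakeMany (below) can dequeue it. A hedged sketch of the insert side; the feed operators are assumed placeholders:

// Sketch: one-component barrier plus an insert op that fills component 0.
void BuildBarrierInsert(ge::Operator &keys, ge::Operator &values) {
  auto barrier = ge::op::Barrier("barrier");
  barrier.set_attr_component_types({ge::DT_FLOAT});  // required

  auto insert = ge::op::BarrierInsertMany("insert");
  insert.set_input_handle(barrier)
        .set_input_keys(keys)          // 1D DT_STRING keys
        .set_input_values(values)      // dim 0 must match the number of keys
        .set_attr_component_index(0);  // which tuple slot this insert fills
}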
+/**
+*@brief Takes the given number of completed elements from a barrier.
+
+*@par Inputs:
+*Including: \n
+* @li handle: A Tensor of type DT_STRING_REF. The handle to a barrier.
+* @li num_elements: A Tensor of type DT_INT32. \n
+A single-element tensor containing the number of elements to take.
+
+*@par Attributes:
+*@li component_types: The type of each component in a value.
+*@li allow_small_batch: A bool. Allows the operation to return fewer than \n
+"num_elements" items if the barrier is already closed.
+*@li wait_for_incomplete: A bool. Whether to wait for incomplete barrier \n
+elements. Defaults to "false".
+*@li timeout_ms: If the queue is empty, this operation will block for up to \n
+"timeout_ms" milliseconds. Note: This option is not supported yet.
+
+*@par Outputs:
+*@li indices: A 1D tensor of type DT_INT64. The indices, with length \n
+"num_elements". These indices refer to the batch in which the values were \n
+placed into the barrier.
+*@li keys: A 1D tensor of keys, \n
+with length "num_elements" of type DT_STRING.
+*@li values: A 1D tensor per component in a barrier element. \n
+All values have length "num_elements" along the 0th dimension. \n
+Must be one of the following types: \n
+DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, \n
+DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, \n
+DT_RESOURCE, DT_STRING.
+
+*@attention Constraints:\n
+*BarrierTakeMany runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(BarrierTakeMany)
     .INPUT(handle, TensorType({DT_STRING_REF}))
     .INPUT(num_elements, TensorType(DT_INT32))
@@ -510,28 +1588,98 @@ REG_OP(BarrierTakeMany)
     .DYNAMIC_OUTPUT(values,
                     TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
                                 DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \
-                                DT_UINT32, DT_UINT64}))
+                                DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING}))
     .REQUIRED_ATTR(component_types, ListType)
     .ATTR(allow_small_batch, Bool, false)
     .ATTR(wait_for_incomplete, Bool, false)
     .ATTR(timeout_ms, Int, -1)
     .OP_END_FACTORY_REG(BarrierTakeMany)

+/**
+*@brief Closes the given barrier.
+
+*@par Inputs:
+*Including: \n
+*handle: A Tensor of type DT_STRING_REF. The handle to a barrier.
+
+*@par Attributes:
+*cancel_pending_enqueues: If true, all pending enqueue requests \n
+that are blocked on the barrier's queue will \n
+be canceled. InsertMany will fail, \n
+even if no new key is introduced.
+
+*@attention Constraints:\n
+*BarrierClose runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(BarrierClose)
     .INPUT(handle, TensorType({DT_STRING_REF}))
     .ATTR(cancel_pending_enqueues, Bool, false)
     .OP_END_FACTORY_REG(BarrierClose)

+/**
+*@brief Computes the number of complete elements in the given barrier.
+
+*@par Inputs:
+*Including: \n
+*handle: A Tensor of type DT_STRING_REF. The handle to a barrier.
+
+*@par Outputs:
+*size: A Tensor of type DT_INT32. The number of complete elements.
+
+*@attention Constraints:\n
+*BarrierReadySize runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(BarrierReadySize)
     .INPUT(handle, TensorType({DT_STRING_REF}))
     .OUTPUT(size, TensorType(DT_INT32))
     .OP_END_FACTORY_REG(BarrierReadySize)

+/**
+*@brief Computes the number of incomplete elements in the given barrier.
+
+*@par Inputs:
+*Including: \n
+*handle: A Tensor of type DT_STRING_REF. The handle to a barrier.
+
+*@par Outputs:
+*size: A Tensor of type DT_INT32. The number of incomplete elements in the barrier.
+
+*@attention Constraints:\n
+*BarrierIncompleteSize runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(BarrierIncompleteSize)
     .INPUT(handle, TensorType({DT_STRING_REF}))
     .OUTPUT(size, TensorType(DT_INT32))
     .OP_END_FACTORY_REG(BarrierIncompleteSize)

+/**
+*@brief Emits randomized records.
+
+*@par Attributes:
+*@li file_pattern: A string. Glob pattern for the data files.
+*@li file_random_seed: An optional int. Defaults to 301. Random seeds used to \n
+produce randomized records.
+*@li file_shuffle_shift_ratio: An optional float. Defaults to 0. Shifts the \n
+list of files after the list is randomly shuffled.
+*@li file_buffer_size: An optional int. Defaults to 10000. The randomization \n
+shuffling buffer.
+*@li file_parallelism: An optional int. Defaults to 16. How many sstables are \n
+opened and concurrently iterated over.
+*@li batch_size: An optional int. Defaults to 32. The batch size.
+*@li compression_type: An optional string. Defaults to "". The type of \n
+compression for the file. Currently ZLIB and GZIP are supported.
+
+*@par Outputs:
+*records: A Tensor of type string.
+
+*/
+
 REG_OP(RecordInput)
     .OUTPUT(records, TensorType({DT_STRING}))
     .REQUIRED_ATTR(file_pattern, String)
@@ -543,6 +1691,26 @@ REG_OP(RecordInput)
     .ATTR(compression_type, String, "")
     .OP_END_FACTORY_REG(RecordInput)

+/**
+*@brief A conditional accumulator for aggregating gradients.
+
+*@par Attributes:
+*@li dtype: The type of the value being accumulated.
+*@li shape: The shape of the values, can be [], in which case shape is unknown.
+*@li container: If non-empty, this accumulator is placed in the given container. \n
+Otherwise, a default container is used.
+*@li shared_name: If non-empty, this accumulator will be shared under the given \n
+name across multiple sessions.
+*@li reduction_type: The reduction operator type. Defaults to "MEAN".
+
+*@par Outputs:
+*handle: A Tensor of type DT_STRING_REF. The handle to the accumulator.
+
+*@attention Constraints:\n
+*ConditionalAccumulator runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(ConditionalAccumulator)
     .OUTPUT(handle, TensorType({DT_STRING_REF}))
     .REQUIRED_ATTR(dtype, Type)
@@ -552,32 +1720,125 @@ REG_OP(ConditionalAccumulator)
     .ATTR(reduction_type, String, "MEAN")
     .OP_END_FACTORY_REG(ConditionalAccumulator)

+/**
+*@brief Applies a gradient to a given accumulator.
+
+*@par Inputs:
+*Does not add if "local_step" is less than the accumulator's "global_step". \n
+* @li handle: A Tensor of type DT_STRING_REF. The handle to an accumulator.
+* @li local_step: A Tensor of type DT_INT64. \n
+The "local_step" value at which the gradient was computed.
+* @li gradient: A tensor of the gradient to be accumulated. \n
+Must be one of the following types: \n
+DT_FLOAT16, DT_FLOAT, DT_DOUBLE.
+
+*@par Attributes:
+*dtype: Must be one of the following types: \n
+DT_FLOAT16, DT_FLOAT, DT_DOUBLE.
+
+*@attention Constraints:\n
+*AccumulatorApplyGradient runs on the Ascend AI CPU, \n
+which delivers poor performance.\n
+
+*/
+
 REG_OP(AccumulatorApplyGradient)
     .INPUT(handle, TensorType({DT_STRING_REF}))
     .INPUT(local_step, TensorType({DT_INT64}))
-    .INPUT(gradient, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT}))
+    .INPUT(gradient, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
     .REQUIRED_ATTR(dtype, Type)
     .OP_END_FACTORY_REG(AccumulatorApplyGradient)

+/**
+*@brief Returns the number of gradients aggregated in the given accumulator.
+
+*@par Inputs:
+*Including: \n
+*handle: A Tensor of type DT_STRING_REF. The handle to an accumulator.
+
+*@par Outputs:
+*y: A Tensor of type DT_INT32. The number of gradients aggregated \n
+in the given accumulator.
+
+*@attention Constraints:\n
+*AccumulatorNumAccumulated runs on the Ascend AI CPU, \n
+which delivers poor performance.\n
+
+*/
+
 REG_OP(AccumulatorNumAccumulated)
     .INPUT(handle, TensorType({DT_STRING_REF}))
     .OUTPUT(y, TensorType({DT_INT32}))
     .OP_END_FACTORY_REG(AccumulatorNumAccumulated)
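Putting the accumulator ops above together: gradients are applied against a local step (stale ones are dropped) and the aggregate count can be polled. A hedged sketch, with the feed operators as assumed placeholders:

// Sketch: FLOAT accumulator with MEAN reduction, one apply, one count read.
void BuildAccumulatorApply(ge::Operator &local_step, ge::Operator &grad) {
  auto acc = ge::op::ConditionalAccumulator("acc");
  acc.set_attr_dtype(ge::DT_FLOAT)
     .set_attr_shape({64})              // shape of each accumulated gradient
     .set_attr_reduction_type("MEAN");  // aggregate is averaged

  auto apply = ge::op::AccumulatorApplyGradient("apply");
  apply.set_input_handle(acc)
       .set_input_local_step(local_step)  // stale steps are ignored
       .set_input_gradient(grad)          // DT_FLOAT16/DT_FLOAT/DT_DOUBLE
       .set_attr_dtype(ge::DT_FLOAT);

  auto count = ge::op::AccumulatorNumAccumulated("count");
  count.set_input_handle(acc);            // y: DT_INT32 gradient count
}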
+/**
+*@brief Updates the accumulator with a new value for "global_step".
+
+*@par Inputs:
+*Input "new_global_step" is a scalar. \n
+* @li handle: A Tensor of type DT_STRING_REF. The handle to an accumulator.
+* @li new_global_step: The new "global_step" value to set. A Tensor of type DT_INT64.
+
+*@attention Constraints:\n
+*AccumulatorSetGlobalStep runs on the Ascend AI CPU, which delivers poor performance.\n
+
+*/
+
 REG_OP(AccumulatorSetGlobalStep)
     .INPUT(handle, TensorType({DT_STRING_REF}))
     .INPUT(new_global_step, TensorType({DT_INT64}))
     .OP_END_FACTORY_REG(AccumulatorSetGlobalStep)

+/**
+*@brief Extracts the average gradient in the given ConditionalAccumulator.
+
+*@par Inputs:
+* Input "num_required" is a scalar. \n
+* @li handle: A Tensor of type DT_STRING_REF. The handle to an accumulator.
+* @li num_required: A Tensor of type DT_INT32. \n
+Number of gradients required before an aggregate is returned.
+
+*@par Attributes:
+*dtype: The data type of accumulated gradients. \n
+Needs to correspond to the type of the accumulator.
+
+*@par Outputs:
+*y: The average of the accumulated gradients. \n
+Must be one of the following types: \n
+DT_FLOAT16, DT_FLOAT, DT_DOUBLE.
+
+*@attention Constraints:\n
+*AccumulatorTakeGradient runs on the Ascend AI CPU, \n
+which delivers poor performance.\n
+
+*/
+
 REG_OP(AccumulatorTakeGradient)
     .INPUT(handle, TensorType({DT_STRING_REF}))
     .INPUT(num_required, TensorType({DT_INT32}))
-    .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, \
-        DT_INT64, DT_DOUBLE, DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
     .REQUIRED_ATTR(dtype, Type)
     .OP_END_FACTORY_REG(AccumulatorTakeGradient)

+/**
+*@brief A conditional accumulator for aggregating sparse gradients.
+
+*@par Attributes:
+*@li shape: The shape of the values.
+*@li dtype: The type of the value being accumulated.
+*@li container: If non-empty, this accumulator is placed in the given \n
+container. Otherwise, a default container is used.
+*@li shared_name: If non-empty, this accumulator will be shared under the \n
+given name across multiple sessions.
+*@li reduction_type: The reduction method whose type is string, \n
+default is "MEAN".
+
+*@par Outputs:
+*handle: The handle to the accumulator.
+
+*/
+
 REG_OP(SparseConditionalAccumulator)
     .OUTPUT(handle, TensorType({DT_STRING_REF}))
     .REQUIRED_ATTR(shape, ListInt)
@@ -587,17 +1848,61 @@ REG_OP(SparseConditionalAccumulator)
     .ATTR(reduction_type, String, "MEAN")
     .OP_END_FACTORY_REG(SparseConditionalAccumulator)

+/**
+*@brief Applies a sparse gradient to a given accumulator.
+
+*@par Inputs:
+*The input handle must be type string_ref. Inputs include: \n
+*@li handle: A Tensor of type mutable string. The handle to an accumulator.
+*@li local_step: A Tensor of type int64. The local_step value at which the \n
+sparse gradient was computed.
+*@li indices: A Tensor of type int64. Indices of the sparse gradient to be \n
+accumulated. Must be a vector.
+*@li values: A Tensor. Values are the non-zero slices of the gradient, \n
+and must have the same first dimension as indices, i.e., the nnz represented \n
+by indices and values must be consistent.
+*@li shape: A Tensor of type int64.
+
+*@par Attributes:
+*@li has_known_shape: A bool. Boolean indicating whether gradient_shape is \n
+unknown, in which case the input is ignored during validation.
+*@li dtype: The data type of accumulated gradients. Needs to correspond to \n
+the type of the accumulator.
+ +*/ + REG_OP(SparseAccumulatorApplyGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(local_step, TensorType({DT_INT64})) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT})) + DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT,DT_FLOAT16, DT_UINT32, \ + DT_UINT64, DT_COMPLEX64, DT_COMPLEX128, DT_QINT16, DT_QUINT16, \ + DT_QINT8, DT_QUINT8, DT_QINT32})) .INPUT(shape, TensorType({DT_INT64})) .REQUIRED_ATTR(has_known_shape, Bool) .REQUIRED_ATTR(dtype, Type) .OP_END_FACTORY_REG(SparseAccumulatorApplyGradient) +/** +*@brief Extracts the average sparse gradient in a SparseConditionalAccumulator. + +*@par Inputs: +*The input handle must be type string_ref. Inputs include: \n +*@li handle: The handle to a SparseConditionalAccumulator. +*@li num_required: Number of gradients required before we return an aggregate. + +*@par Attributes: +*dtype: The data type of accumulated gradients. Needs to correspond to the \n +type of the accumulator. + +*@par Outputs: +*@li indices: Indices of the average of the accumulated sparse gradients. +*@li values: Values of the average of the accumulated sparse gradients. +*@li shape: Shape of the average of the accumulated sparse gradients. + +*/ + REG_OP(SparseAccumulatorTakeGradient) .INPUT(handle, TensorType({DT_STRING_REF})) .INPUT(num_required, TensorType({DT_INT32})) diff --git a/third_party/fwkacllib/inc/ops/decode_bbox.h b/third_party/fwkacllib/inc/ops/decode_bbox.h new file mode 100644 index 00000000..9fe95488 --- /dev/null +++ b/third_party/fwkacllib/inc/ops/decode_bbox.h @@ -0,0 +1,33 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #ifndef GE_OP_DECODE_BBOX_H + #define GE_OP_DECODE_BBOX_H + + #include "graph/operator_reg.h" + + namespace ge { + + REG_OP(DecodeBbox) + .INPUT(box_predictions, TensorType{DT_FLOAT16}) + .INPUT(anchors, TensorType{DT_FLOAT16}) + .OUTPUT(decoded_boxes, TensorType{DT_FLOAT16}) + .REQUIRED_ATTR(decode_clip, Float) + .OP_END_FACTORY_REG(DecodeBbox) + + } // namespace ge + + #endif // GE_OP_DECODE_BBOX_H diff --git a/third_party/fwkacllib/inc/ops/decode_boundaries_target.h b/third_party/fwkacllib/inc/ops/decode_boundaries_target.h new file mode 100644 index 00000000..f6951f9a --- /dev/null +++ b/third_party/fwkacllib/inc/ops/decode_boundaries_target.h @@ -0,0 +1,31 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #ifndef GE_OP_DECODE_BOUNDARIES_TARGET_H + #define GE_OP_DECODE_BOUNDARIES_TARGET_H + + #include "graph/operator_reg.h" + + namespace ge { + + REG_OP(DecodeBoundariesTarget) + .INPUT(boundary_predictions, TensorType({DT_FLOAT16})) /* "First operand." */ + .INPUT(anchors, TensorType({DT_FLOAT16})) /* "Second operand." */ + .OUTPUT(boundary_encoded, TensorType({DT_FLOAT16})) /* "Result, has same element type as two inputs" */ + .OP_END_FACTORY_REG(DecodeBoundariesTarget) + } // namespace ge + + #endif // GE_OP_DECODE_BOUNDARIES_TARGET_H \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/decode_cornerpoints_target_bg.h b/third_party/fwkacllib/inc/ops/decode_cornerpoints_target_bg.h new file mode 100644 index 00000000..ce10175f --- /dev/null +++ b/third_party/fwkacllib/inc/ops/decode_cornerpoints_target_bg.h @@ -0,0 +1,31 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #ifndef GE_OP_DECODE_CORNERPOINTS_TARGET_BG_H + #define GE_OP_DECODE_CORNERPOINTS_TARGET_BG_H + + #include "graph/operator_reg.h" + + namespace ge { + + REG_OP(DecodeCornerpointsTargetBG) + .INPUT(keypoints_prediction, TensorType({DT_FLOAT16})) /* "First operand." */ + .INPUT(anchors, TensorType({DT_FLOAT16})) /* "Second operand." */ + .OUTPUT(keypoints_decoded, TensorType({DT_FLOAT16})) /* "Result, has same element type as two inputs" */ + .OP_END_FACTORY_REG(DecodeCornerpointsTargetBG); + } // namespace ge + + #endif // GE_OP_DECODE_CORNERPOINTS_TARGET_BG_H diff --git a/third_party/fwkacllib/inc/ops/decode_cornerpoints_target_wrt_center_v1.h b/third_party/fwkacllib/inc/ops/decode_cornerpoints_target_wrt_center_v1.h new file mode 100644 index 00000000..0e96bc16 --- /dev/null +++ b/third_party/fwkacllib/inc/ops/decode_cornerpoints_target_wrt_center_v1.h @@ -0,0 +1,32 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #ifndef GE_OP_DECODE_CORNERPOINTS_TARGET_WRT_CENTER_V1_H + #define GE_OP_DECODE_CORNERPOINTS_TARGET_WRT_CENTER_V1_H + + #include "graph/operator_reg.h" + + namespace ge { + + REG_OP(DecodeCornerpointsTargetWrtCenterV1) + .INPUT(keypoints_prediction, TensorType({DT_FLOAT16})) /* "First operand." */ + .INPUT(anchors, TensorType({DT_FLOAT16})) /* "Second operand." 
*/ + .OUTPUT(keypoints_decoded, TensorType({DT_FLOAT16})) /* "Result, has same element type as two inputs" */ + .OP_END_FACTORY_REG(DecodeCornerpointsTargetWrtCenterV1) + } // namespace ge + + #endif // GE_OP_DECODE_CORNERPOINTS_TARGET_WRT_CENTER_V1_H + diff --git a/third_party/fwkacllib/inc/ops/decode_wheels_target.h b/third_party/fwkacllib/inc/ops/decode_wheels_target.h new file mode 100644 index 00000000..053a6c1a --- /dev/null +++ b/third_party/fwkacllib/inc/ops/decode_wheels_target.h @@ -0,0 +1,31 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #ifndef GE_OP_DECODE_WHEELS_TARGET_H + #define GE_OP_DECODE_WHEELS_TARGET_H + + #include "graph/operator_reg.h" + + namespace ge { + + REG_OP(DecodeWheelsTarget) + .INPUT(boundary_predictions, TensorType({DT_FLOAT16})) /* "First operand." */ + .INPUT(anchors, TensorType({DT_FLOAT16})) /* "Second operand." */ + .OUTPUT(boundary_encoded, TensorType({DT_FLOAT16})) /* "Result, has same element type as two inputs" */ + .OP_END_FACTORY_REG(DecodeWheelsTarget) + } // namespace ge + + #endif // GE_OP_DECODE_WHEELS_TARGET_H diff --git a/third_party/fwkacllib/inc/ops/dvpp_ops.h b/third_party/fwkacllib/inc/ops/dvpp_ops.h deleted file mode 100644 index 98294c14..00000000 --- a/third_party/fwkacllib/inc/ops/dvpp_ops.h +++ /dev/null @@ -1,62 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#ifndef GE_OP_DVPP_OPS_H_
-#define GE_OP_DVPP_OPS_H_
-
-#include "graph/operator_reg.h"
-
-namespace ge {
-
-REG_OP(DvppCreateChannel)
-    .OUTPUT(dvpp_channel, TensorType({DT_INT64}))
-    .OP_END_FACTORY_REG(DvppCreateChannel)
-
-REG_OP(DvppDestroyChannel)
-    .INPUT(dvpp_channel, TensorType({DT_INT64}))
-    .OP_END_FACTORY_REG(DvppDestroyChannel)
-
-REG_OP(DvppResize)
-    .INPUT(dvpp_channel, TensorType({DT_INT64}))
-    .INPUT(input_desc, TensorType({DT_UINT8}))
-    .INPUT(output_desc, TensorType({DT_UINT8}))
-    .INPUT(resize_config, TensorType({DT_UINT8}))
-    .OP_END_FACTORY_REG(DvppResize)
-
-REG_OP(DvppCrop)
-    .INPUT(dvpp_channel, TensorType({DT_INT64}))
-    .INPUT(input_desc, TensorType({DT_UINT8}))
-    .INPUT(output_desc, TensorType({DT_UINT8}))
-    .INPUT(crop_area, TensorType({DT_UINT8}))
-    .OP_END_FACTORY_REG(DvppCrop)
-
-REG_OP(DvppCropAndPaste)
-    .INPUT(dvpp_channel, TensorType({DT_INT64}))
-    .INPUT(input_desc, TensorType({DT_UINT8}))
-    .INPUT(output_desc, TensorType({DT_UINT8}))
-    .INPUT(crop_area, TensorType({DT_UINT8}))
-    .INPUT(paste_area, TensorType({DT_UINT8}))
-    .OP_END_FACTORY_REG(DvppCropAndPaste)
-
-REG_OP(DvppDecodeJpeg)
-    .INPUT(dvpp_channel, TensorType({DT_INT64}))
-    .INPUT(input_desc, TensorType({DT_UINT8}))
-    .INPUT(output_desc, TensorType({DT_UINT8}))
-    .INPUT(decode_area, TensorType({DT_UINT8}))
-    .OP_END_FACTORY_REG(DvppDecodeJpeg)
-} // namespace ge
-
-#endif // GE_OP_DVPP_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
index f92f42eb..d5272805 100644
--- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
+++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
@@ -24,8 +24,8 @@ namespace ge {
 
 *@par Inputs:
 *Dynamic inputs, including:
-* @li x: A list of Tensor objects, each with same shape and type. The supported types are:
-* float16, float32, double, int32, uint8, int16, int8, complex64, int64,
+* @li x: A list of Tensor objects, each with same shape and type. The supported types are:
+* float16, float32, double, int32, uint8, int16, int8, complex64, int64,
 * qint8, quint8, qint32, uint16, complex128, uint32, uint64.
 
 *@par Outputs:
@@ -104,7 +104,7 @@ REG_OP(MinimumGrad)
     .OP_END_FACTORY_REG(MinimumGrad)
 
 /**
-*@brief: Cast a tensor form src data type to dst data type.
+*@brief Cast a tensor from src data type to dst data type.
 
 *@par Inputs:
 *One input:\n
@@ -117,17 +117,6 @@ REG_OP(MinimumGrad)
 
 *@par Outputs:
 *y:A `Tensor`. Has the same type as `x`.
-
-*@par Quantization supported or not
-*Not supported
-
-*@par Quantized inference supported or not
-*Not supported
-
-*@par Multiple batches supported or not
-*Supported
-
-*@since V100R001C33
 */
 REG_OP(Cast)
     .INPUT(x, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT32, DT_UINT8,
@@ -296,7 +285,7 @@ REG_OP(Sub)
     .OP_END_FACTORY_REG(Sub)
 
 /**
-*@ brief computes the absolute value of a tensor.
+*@brief Computes the absolute value of a tensor.
 
 *@par Inputs:
 *One inputs, including:
@@ -388,8 +377,8 @@ REG_OP(Cos)
 
 *@par Inputs:
 * Two inputs, including:
-*@li x1: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8
-*@li x2: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8
+*@li x1: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8, float64, int64, uint16, int16, complex64, complex128
+*@li x2: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8, float64, int64, uint16, int16, complex64, complex128
 
 *@par Outputs:
 * y: A Tensor. Has the same type and format as input "x1".
 
@@ -435,7 +424,7 @@ REG_OP(Equal)
 
 *@par Inputs:
 *One input:\n
-*x: A Tensor. Must be one of the following types: float16, float32, double, complex64, complex128.
+*x: A Tensor. Must be one of the following types: float16, float32, float64, complex64, complex128.
 
 *@par Attributes:
 *@li base: An optional attribute of type float32, specifying the base gamma. Defaults to "-1".
@@ -1446,10 +1435,13 @@ REG_OP(ApproximateEqual)
 *@par Outputs:
 * y: A tensor. Has the same type as "x".
 *
+*@par Attributes:
+* N: A required int. The number of dynamic inputs "x".
 */
 REG_OP(AccumulateNV2)
     .DYNAMIC_INPUT(x, TensorType::NumberType())
     .OUTPUT(y, TensorType::NumberType())
+    .REQUIRED_ATTR(N, Int)
     .OP_END_FACTORY_REG(AccumulateNV2)
 
 /**
@@ -1563,8 +1555,6 @@ REG_OP(FakeQuantWithMinMaxVars)
 
 *@see Region()
 
-*@par Third-party framework compatibility
-* Compatible with the operator FakeQuantWithMinVarsGradient.
 */
 REG_OP(FakeQuantWithMinMaxVarsGradient)
     .INPUT(gradients, TensorType({DT_FLOAT}))
@@ -1915,16 +1905,16 @@ REG_OP(BiasAdd)
 *@li dimension: A Tensor. Must be one of the following types: int32, int64. Must be in the range [-rank(input x), rank(input x)]. Describes which dimension of the input Tensor to reduce across. \n
 * The format is ND.
 *@par Attributes:
-*output_type: The output type, either "int32" or "int64". Defaults to "int64".
+*dtype: The output type, either "int32" or "int64". Defaults to "int64".
 
 *@par Outputs:
-*y: A Tensor of type "output_type".
+*y: A Tensor of type "dtype".
 */
 REG_OP(ArgMin)
     .INPUT(x, TensorType::NumberType())
     .INPUT(dimension, TensorType::IndexNumberType())
     .OUTPUT(y, TensorType({DT_INT32, DT_INT64}))
-    .ATTR(output_type, Type, DT_INT64)
+    .ATTR(dtype, Type, DT_INT64)
     .OP_END_FACTORY_REG(ArgMin)
 
 /**
@@ -1937,16 +1927,16 @@ REG_OP(ArgMin)
 
 *@par Attributes:
 *@li dimension: The dimension of the input Tensor to reduce across.
-*@li output_type: An optional attribute, specifying the output data type. Must be "int32". Defaults to "int64".
+*@li dtype: An optional attribute, specifying the output data type. Must be "int32". Defaults to "int64".
 
 *@par Outputs:
-*y: A Tensor of type output_type.
+*y: A Tensor of type dtype.
 */
 REG_OP(ArgMinD)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
     .OUTPUT(y, TensorType({DT_INT32}))
     .REQUIRED_ATTR(dimension, Int)
-    .ATTR(output_type, Type, DT_INT64)
+    .ATTR(dtype, Type, DT_INT64)
     .OP_END_FACTORY_REG(ArgMinD)
 
 /**
@@ -1964,12 +1954,12 @@ REG_OP(ArgMinD)
 
 *@li x: If there are multiple maximum values, the index of the first maximum value is used.
 *@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x".
 */
-REG_OP(ArgMax)
+REG_OP(ArgMaxV2)
     .INPUT(x, TensorType::NumberType())
     .INPUT(dimension, TensorType::IndexNumberType())
     .OUTPUT(y, TensorType({DT_INT32, DT_INT64}))
-    .ATTR(output_type, Type, DT_INT64)
-    .OP_END_FACTORY_REG(ArgMax)
+    .ATTR(dtype, Type, DT_INT64)
+    .OP_END_FACTORY_REG(ArgMaxV2)
 
 /**
 *@brief Returns the index with the largest value across axes of a tensor.
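The hunks above rename ArgMax to ArgMaxV2 and its "output_type" attribute to "dtype" (ArgMin, ArgMinD, and ArgMaxD keep their names but get the same attribute rename). Callers built against the old spelling need the equivalent of this hedged sketch; the feed operators are illustrative assumptions:

// Sketch: migrated ArgMax call using the new op and attribute names.
ge::op::ArgMaxV2 MakeArgMax(ge::Operator &x, ge::Operator &dim) {
  return ge::op::ArgMaxV2("argmax")    // formerly ge::op::ArgMax
      .set_input_x(x)
      .set_input_dimension(dim)        // axis to reduce across
      .set_attr_dtype(ge::DT_INT32);   // attribute was "output_type"
}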
@@ -1992,7 +1982,7 @@ REG_OP(ArgMaxD)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
     .OUTPUT(y, TensorType({DT_INT32}))
     .REQUIRED_ATTR(dimension, Int)
-    .ATTR(output_type, Type, DT_INT64)
+    .ATTR(dtype, Type, DT_INT64)
     .OP_END_FACTORY_REG(ArgMaxD)
 
 /**
@@ -2053,20 +2043,24 @@ REG_OP(ArgMinWithValue)
 
 *@par Inputs:
 *One input: \n
-*x: the list of input data, the type of element is Tensor that \n
-* should met one of the following types:
-* float16, float32
+*x: the list of input data. All input tensors must have the same type. \n
+* The max size of "x" is 32. \n
+* The type must be one of the following: float16, float32.
+
+*@par Outputs:
+*y: A Tensor. Has the same type and format as "x".
 
 *@par Attributes:
+*@li N: A required attribute. The number of inputs "x". Max size is 32.
 *@li model: An optional attribute. Defaults to "1".
 * "0": product, "1": sum, "2": max.
 *@li coeff: A required attribute. Must met all of following rules:
-* size of "coeff" must be equal to len("x").
-* element of "coeff" must be a number.
+* the size of "coeff" must be equal to len("x") or be empty.
 */
 REG_OP(Eltwise)
-    .DYNAMIC_INPUT(__input, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .REQUIRED_ATTR(N, Int)
     .ATTR(mode, Int, 1)
     .ATTR(coeff, ListFloat, {})
     .OP_END_FACTORY_REG(Eltwise)
@@ -2090,139 +2084,92 @@ REG_OP(PopulationCount)
 
 *@par Inputs:
 *Thirteen inputs, including:
-* @li input1: A Tensor. Must be one of the following types: float16, float32.
-* @li input2: A Tensor. Must be one of the following types: float16, float32.
-* @li input3: A Tensor. Must be one of the following types: float16, float32.
-* @li input4: A Tensor. Must be one of the following types: float16, float32.
-* @li input5: A Tensor. Must be one of the following types: float16, float32.
-* @li input6: A Tensor. Must be one of the following types: float16, float32.
-* @li input7: A Tensor. Must be one of the following types: float16, float32.
-* @li input8: A Tensor. Must be one of the following types: float16, float32.
-* @li input9: A Tensor. Must be one of the following types: float16, float32.
-* @li inputx0: A Tensor. Must be one of the following types: float16, float32.
-* @li inputx1: A Tensor. Must be one of the following types: float16, float32.
-* @li inputx2: A Tensor. Must be one of the following types: float16, float32.
-* @li inputx3: A Tensor. Must be one of the following types: float16, float32.
+* @li input_mul3: A Tensor. Must be one of the following types: float16, float32.
+* @li input_mul2: A Tensor. Must be one of the following types: float16, float32.
+* @li input_realdiv1: A Tensor. Must be one of the following types: float16, float32.
+* @li input_mul1: A Tensor. Must be one of the following types: float16, float32.
+* @li input_mul0: A Tensor. Must be one of the following types: float16, float32.
+* @li input_realdiv0: A Tensor. Must be one of the following types: float16, float32.
+* @li input_mul4: A Tensor. Must be one of the following types: float16, float32.
+* @li mul0_x: A Tensor. Must be one of the following types: float16, float32.
+* @li mul1_sub: A Tensor. Must be one of the following types: float16, float32.
+* @li mul2_x: A Tensor. Must be one of the following types: float16, float32.
+* @li mul3_sub1: A Tensor. Must be one of the following types: float16, float32.
+* @li mul4_x: A Tensor. Must be one of the following types: float16, float32.
+* @li add2_y: A Tensor. Must be one of the following types: float16, float32.
*@par Outputs: *Four outputs, including: -* @li output1: A Tensor. Must be one of the following types: float16, float32. -* @li output2: A Tensor. Must be one of the following types: float16, float32. -* @li output3: A Tensor. Must be one of the following types: float16, float32. -* @li output4: A Tensor. Must be one of the following types: float16, float32. +* @li y1: A Tensor. Must be one of the following types: float16, float32. +* @li y2: A Tensor. Must be one of the following types: float16, float32. +* @li y3: A Tensor. Must be one of the following types: float16, float32. +* @li y4: A Tensor. Must be one of the following types: float16, float32. */ REG_OP(LambNextMVWithDecay) - .INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input4, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input5, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input6, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input7, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input8, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input9, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(inputx0, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(inputx1, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(inputx2, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(inputx3, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(output1, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(output2, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(output3, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(output4, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul3, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul2, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_realdiv1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul0, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_realdiv0, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul4, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul0_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul1_sub, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul2_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul3_sub1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul4_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(add2_y, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(y1, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(y2, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(y3, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(y4, TensorType({DT_FLOAT16,DT_FLOAT})) .OP_END_FACTORY_REG(LambNextMVWithDecay) -/** -*@brief A fusion operator for bert lamb. - -*@par Inputs: -*Thirteen inputs, including: -* @li input1: A Tensor. Must be one of the following types: float16, float32. -* @li input2: A Tensor. Must be one of the following types: float16, float32. -* @li input3: A Tensor. Must be one of the following types: float16, float32. -* @li input4: A Tensor. Must be one of the following types: float16, float32. -* @li input5: A Tensor. Must be one of the following types: float16, float32. -* @li input6: A Tensor. Must be one of the following types: float16, float32. -* @li input7: A Tensor. Must be one of the following types: float16, float32. -* @li input8: A Tensor. Must be one of the following types: float16, float32. -* @li input9: A Tensor. Must be one of the following types: float16, float32. -* @li inputx0: A Tensor. Must be one of the following types: float16, float32. -* @li inputx1: A Tensor. Must be one of the following types: float16, float32. -* @li inputx2: A Tensor. 
Must be one of the following types: float16, float32.
-* @li inputx3: A Tensor. Must be one of the following types: float16, float32.
-
-*@par Outputs:
-*Four outputs, including:
-* @li output1: A Tensor. Must be one of the following types: float16, float32.
-* @li output2: A Tensor. Must be one of the following types: float16, float32.
-* @li output3: A Tensor. Must be one of the following types: float16, float32.
-* @li output4: A Tensor. Must be one of the following types: float16, float32.
-
-*/
-REG_OP(LambNextMVWithDecayV1)
-    .INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input4, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input5, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input6, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input7, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input8, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input9, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(inputx0, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(inputx1, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(inputx2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(inputx3, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OUTPUT(output1, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OUTPUT(output2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OUTPUT(output3, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OUTPUT(output4, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OP_END_FACTORY_REG(LambNextMVWithDecayV1)
-
 /**
 *@brief Confuse real_div, rsqrt, sqrt, maximum, minimum, sub and add.
 
 *@par Inputs:
 *Thirteen inputs, including:
-* @li input1: A Tensor. Must be one of the following types: float16, float32.
-* @li input2: A Tensor of the same type as "input1".
-* @li input3: A Tensor of the same type as "input1".
-* @li input4: A Tensor of the same type as "input1".
-* @li input5: A Tensor of the same type as "input1".
-* @li input6: A Tensor. Must be one of the following types: float16, float32.
-* @li input7: A Tensor of the same type as "input1".
-* @li input8: A Tensor of the same type as "input1".
-* @li input9: A Tensor of the same type as "input1".
-* @li inputx0: A Tensor of the same type as "input1".
-* @li inputx1: A Tensor. Must be one of the following types: float16, float32.
-* @li inputx2: A Tensor of the same type as "input1".
-* @li inputx3: A Tensor of the same type as "input1".
+* @li input_mul3: A Tensor. Must be one of the following types: float16, float32.
+* @li input_mul2: A Tensor of the same type as "input_mul3".
+* @li input_realdiv1: A Tensor of the same type as "input_mul3".
+* @li input_mul1: A Tensor of the same type as "input_mul3".
+* @li input_mul0: A Tensor of the same type as "input_mul3".
+* @li input_realdiv0: A Tensor. Must be one of the following types: float16, float32.
+* @li input_mul4: A Tensor of the same type as "input_mul3".
+* @li mul0_x: A Tensor of the same type as "input_mul3".
+* @li mul1_sub: A Tensor of the same type as "input_mul3".
+* @li mul2_x: A Tensor of the same type as "input_mul3".
+* @li mul3_sub1: A Tensor. Must be one of the following types: float16, float32.
+* @li mul4_x: A Tensor of the same type as "input_mul3".
+* @li add2_y: A Tensor of the same type as "input_mul3".
 
 *@par Outputs:
 *Four outputs, including:
-*@li output1: A Tensor. Has the same type as "input1".
-*@li output2: A Tensor. Has the same type as "input1".
-*@li output3: A Tensor. Has the same type as "input1".
-*@li output4: A Tensor. Has the same type as "input1".
+*@li y1: A Tensor. Has the same type as "input_mul3".
+*@li y2: A Tensor.
Has the same type as "input_mul3". +*@li y3: A Tensor. Has the same type as "input_mul3". +*@li y4: A Tensor. Has the same type as "input_mul3". */ REG_OP(LambNextMV) - .INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input4, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input5, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input6, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input7, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input8, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(input9, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(inputx0, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(inputx1, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(inputx2, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(inputx3, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(output1, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(output2, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(output3, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(output4, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul3, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul2, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_realdiv1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul0, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_realdiv0, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(input_mul4, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul0_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul1_sub, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul2_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul3_sub1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(mul4_x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(add2_y, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(y1, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(y2, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(y3, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(y4, TensorType({DT_FLOAT16,DT_FLOAT})) .OP_END_FACTORY_REG(LambNextMV) /** @@ -2230,8 +2177,8 @@ REG_OP(LambNextMV) *@par Inputs: *Six inputs, including: -* @li input1: A Tensor. Must be one of the following types: float16, float32. -* @li input2: A Tensor. Must be one of the following types: float16, float32. +* @li input_square: A Tensor. Must be one of the following types: float16, float32. +* @li input_mul2: A Tensor. Must be one of the following types: float16, float32. * @li mul2_x: A Tensor. Must be one of the following types: float16, float32. * @li mul3_x: A Tensor. Must be one of the following types: float16, float32. * @li truediv1_recip: A Tensor. Must be one of the following types: float16, float32. @@ -2239,77 +2186,51 @@ REG_OP(LambNextMV) *@par Outputs: *Two outputs, including: -* @li output1: A Tensor of the same type as "input1". -* @li output2: A Tensor of the same type as "input1". +* @li y1: A Tensor of the same type as "input_square". +* @li y2: A Tensor of the same type as "input_square". 
 */
 REG_OP(LambUpdateWithLr)
-    .INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input4, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input5, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input6, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input7, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input8, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input9, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OUTPUT(output_y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_greater1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_greater_realdiv, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_realdiv, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_mul0, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_mul1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_sub, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(greater_y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(select_e, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(minimum_y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OP_END_FACTORY_REG(LambUpdateWithLr)
 
 /**
@@ -2425,21 +2346,21 @@ REG_OP(AdamApplyOne)
 
 *@par Inputs:
 *Four inputs, including:
-* @li input_x: A Tensor. Must be one of the following types: float16, float32.
-* @li input1: A Tensor. Must be one of the following types: float16, float32.
-* @li input2: A Tensor. Must be one of the following types: float16, float32.
-* @li input3: A Tensor. Must be one of the following types: float16, float32.
+* @li x: A Tensor. Must be one of the following types: float16, float32.
+* @li greater_zeros: A Tensor. Must be one of the following types: float16, float32.
+* @li select_ones: A Tensor. Must be one of the following types: float16, float32.
+* @li maximum_ones: A Tensor. Must be one of the following types: float16, float32.
 
 *@par Outputs:
-*output_y: A Tensor of the same type as "input_x".
+*y: A Tensor of the same type as "x".
 */
 REG_OP(ClipByNormNoDivSum)
-    .INPUT(input_x, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OUTPUT(output_y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(greater_zeros, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(select_ones, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(maximum_ones, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OP_END_FACTORY_REG(ClipByNormNoDivSum)
 
 /**
@@ -2462,7 +2383,7 @@ REG_OP(SquareSumV2)
     .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OUTPUT(y1, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OUTPUT(y2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .ATTR(axis, ListInt, {})
+    .REQUIRED_ATTR(axis, ListInt)
     .ATTR(keep_dims, Bool, false)
     .OP_END_FACTORY_REG(SquareSumV2)
 
@@ -2483,10 +2404,28 @@ y: A Tensor. Has the same type as "x".
 REG_OP(SquareSumV1)
     .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .ATTR(axis, ListInt, {})
+    .REQUIRED_ATTR(axis, ListInt)
     .ATTR(keep_dims, Bool, false)
     .OP_END_FACTORY_REG(SquareSumV1)
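Because "axis" on SquareSumV2/SquareSumV1 changes from an ATTR with a default to a REQUIRED_ATTR in the hunks above, existing graphs must now set it explicitly. A hedged sketch of the updated call, with the feed operator as an assumed placeholder:

// Sketch: SquareSumV1 after the attribute change; axis can no longer be omitted.
ge::op::SquareSumV1 MakeSquareSum(ge::Operator &x) {
  return ge::op::SquareSumV1("square_sum")
      .set_input_x(x)
      .set_attr_axis({0})          // required: axes to reduce over
      .set_attr_keep_dims(false);  // optional, defaults to false
}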
*/
REG_OP(LambUpdateWithLr)
-    .INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input4, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input5, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input6, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input7, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input8, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input9, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OUTPUT(output_y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_greater1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_greater_realdiv, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_realdiv, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_mul0, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_mul1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(input_sub, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(greater_y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(select_e, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(minimum_y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OP_END_FACTORY_REG(LambUpdateWithLr)

/**
@@ -2425,21 +2346,21 @@ REG_OP(AdamApplyOne)

*@par Inputs:
*Four inputs, including:
-* @li input_x: A Tensor. Must be one of the following types: float16, float32.
-* @li input1: A Tensor. Must be one of the following types: float16, float32.
-* @li input2: A Tensor. Must be one of the following types: float16, float32.
-* @li input3: A Tensor. Must be one of the following types: float16, float32.
+* @li x: A Tensor. Must be one of the following types: float16, float32.
+* @li greater_zeros: A Tensor. Must be one of the following types: float16, float32.
+* @li select_ones: A Tensor. Must be one of the following types: float16, float32.
+* @li maximum_ones: A Tensor. Must be one of the following types: float16, float32.

*@par Outputs:
-*output_y: A Tensor of the same type as "input_x".
+*y: A Tensor of the same type as "x".
*/
REG_OP(ClipByNormNoDivSum)
-    .INPUT(input_x, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OUTPUT(output_y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(greater_zeros, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(select_ones, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(maximum_ones, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OP_END_FACTORY_REG(ClipByNormNoDivSum)

/**
@@ -2462,7 +2383,7 @@ REG_OP(SquareSumV2)
    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OUTPUT(y1, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OUTPUT(y2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .ATTR(axis, ListInt, {})
+    .REQUIRED_ATTR(axis, ListInt)
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(SquareSumV2)

@@ -2483,10 +2404,28 @@ y: A Tensor. Has the same type as "x".
REG_OP(SquareSumV1)
    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .ATTR(axis, ListInt, {})
+    .REQUIRED_ATTR(axis, ListInt)
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(SquareSumV1)

+/**
+*@brief Computes the element-wise square of each input tensor and then reduce-sums it.

+*@par Inputs:
+*x1: A Tensor of type float32.
+*x2: A Tensor of type float32.
+
+*@par Outputs:
+*y1: A Tensor. Has the same type as "x1". The reduced sum of the squares of "x1".
+*y2: A Tensor. Has the same type as "x2". The reduced sum of the squares of "x2".
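+
+*@par Example:
+*A worked case (values illustrative only): x1 = [1.0, 2.0] and x2 = [3.0] \n
+give y1 = 1.0 + 4.0 = 5.0 and y2 = 9.0.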
+*/
+REG_OP(SquareSumAll)
+    .INPUT(x1, TensorType({DT_FLOAT}))
+    .INPUT(x2, TensorType({DT_FLOAT}))
+    .OUTPUT(y1, TensorType({DT_FLOAT}))
+    .OUTPUT(y2, TensorType({DT_FLOAT}))
+    .OP_END_FACTORY_REG(SquareSumAll)
+
/**
*@brief Confuse broadcast, addn and mul.

@@ -2549,7 +2488,7 @@ REG_OP(ConfusionMulGrad)
    .INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OUTPUT(output0, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OUTPUT(output1, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .ATTR(axis, ListInt, {})
+    .ATTR(axes, ListInt, {})
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ConfusionMulGrad)

@@ -2560,6 +2499,36 @@ REG_OP(FusedMulAddNL2loss)
    .OUTPUT(y1, TensorType::NumberType())
    .OUTPUT(y2, TensorType::NumberType())
    .OP_END_FACTORY_REG(FusedMulAddNL2loss)
+
+
+/**
+*@brief Tests whether the input exceeds a threshold.
+
+*@par Inputs:
+*@li x: A Tensor with any format. Must be one of the following types: float16, float32.
+
+*@par Attributes:
+*@li threshold: An optional float32. Defaults to "0.0". "x" is compared with "threshold", outputs "1" for inputs above threshold; "0" otherwise.
+
+*@par Outputs:
+*@li y: A Tensor with any format. Has the same type as the input. Must be one of the following types: float16, float32.
+*/
+
+REG_OP(Threshold)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .ATTR(threshold, Float, 0.0)
+    .OP_END_FACTORY_REG(Threshold)
+
+REG_OP(ArgMaxWithK)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(indices, TensorType({DT_INT32, DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(values, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .ATTR(axis, Int, 10000)
+    .ATTR(out_max_val, Bool, false)
+    .ATTR(top_k, Int, 1)
+    .OP_END_FACTORY_REG(ArgMaxWithK)
+
} // namespace ge

#endif // GE_OP_ELEWISE_CALCULATION_OPS_H
diff --git a/third_party/fwkacllib/inc/ops/fastrcnn_predictions.h b/third_party/fwkacllib/inc/ops/fastrcnn_predictions.h
new file mode 100644
index 00000000..e7794e45
--- /dev/null
+++ b/third_party/fwkacllib/inc/ops/fastrcnn_predictions.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_OP_FASTRCNN_PREDICTIONS_H
+#define GE_OP_FASTRCNN_PREDICTIONS_H
+
+#include "graph/operator_reg.h"
+
+namespace ge {
+
+REG_OP(FastrcnnPredictions)
+    .INPUT(rois, TensorType({DT_FLOAT16}))
+    .INPUT(score, TensorType({DT_FLOAT16}))
+    .REQUIRED_ATTR(nms_threshold, Float)
+    .REQUIRED_ATTR(score_threshold, Float)
+    .REQUIRED_ATTR(k, Int)
+    .OUTPUT(sorted_rois, TensorType({DT_FLOAT16}))
+    .OUTPUT(sorted_scores, TensorType({DT_FLOAT16}))
+    .OUTPUT(sorted_classes, TensorType({DT_FLOAT16}))
+    .OP_END_FACTORY_REG(FastrcnnPredictions)
+}  // namespace ge
+
+#endif  // GE_OP_FASTRCNN_PREDICTIONS_H
diff --git a/third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h b/third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h
new file mode 100644
index 00000000..2b3e206d
--- /dev/null
+++ b/third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h
@@ -0,0 +1,67 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_OP_FSRDETECTIONOUTPUT_OPS_H_
+#define GE_OP_FSRDETECTIONOUTPUT_OPS_H_
+#include "graph/operator_reg.h"
+
+namespace ge {
+/**
+*@brief Returns detection result.
+
+*@par Inputs:
+* Four inputs, including:
+*@li rois: An NCHW tensor of type float16 or float32, output from operator proposal_d at the preceding layer, used as the input of operator FSRDetectionOutput.
+*@li prior_box: An NCHWC0 tensor of type float16 or float32, specifying the prediction offset, used to update the coordinates [x1, y1, x2, y2] of each ROI.
+*@li score: An NCHWC0 tensor of type float16 or float32, specifying the probability of each class. Class 0 is the background class.
+*@li actual_rois_num: An NCHW tensor of type int32, specifying the number of valid boxes per batch.
+*@par Attributes:
+*@li batch_rois: An optional int32, specifying the number of images to be predicted. Defaults to "1024". The value range is [1, 1024].
+*@li im_info: An optional list of two ints. Defaults to (375, 1024). The value range is [1, 1024].
+*@li num_classes: An optional int32, specifying the number of classes to be predicted. Defaults to "80". The value must be greater than 0.
+*@li max_rois_num: An optional int32, specifying the maximum number of ROIs per batch. Defaults to "1024". The value must be a multiple of 16.
+*@li score_thresh: An optional float32, specifying the threshold for box filtering. Defaults to 0.45. The value range is [0.0, 1.0].
+*@li nms_thresh: An optional float32, specifying the Non-Maximum Suppression (NMS) threshold for box filtering. Defaults to 0.7. The value range is (0.0, 1.0).
+*@li bbox_reg_weights: An optional list of four ints. Defaults to (1, 1, 1, 1). Must not have value "0".
+*@li post_nms_topn: An optional int, specifying the number of output boxes. Defaults to "304". The value must be less than or equal to 1024 and must be a multiple of 16.
+*@li kernel_name: An optional string, specifying the operator name. Defaults to "fsr_detection_output".
+*@par Outputs:
+*box: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence.
+*actual_bbox_num: An NCHW tensor of type int32, specifying the number of output boxes.
+
+*@attention Constraints:\n
+*@li totalnum < max_rois_num * batch_rois.
+*@li "score" must be with shape (total_num, (num_classes+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images.
+*@li "prior_box" must be with shape (total_num, (num_classes*4+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images.
+*/
+REG_OP(FSRDetectionOutput)
+    .INPUT(rois, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .INPUT(prior_box, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .INPUT(score, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .INPUT(actual_rois_num, TensorType({DT_INT32}))
+    .OUTPUT(actual_bbox_num, TensorType({DT_INT32}))
+    .OUTPUT(box, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .ATTR(batch_rois, Int, 1024)
+    .ATTR(im_info, ListInt, {375,1024})
+    .ATTR(num_classes, Int, 80)
+    .ATTR(max_rois_num, Int, 1024)
+    .ATTR(score_thresh, Float, 0.45)
+    .ATTR(nms_thresh, Float, 0.7)
+    .ATTR(bbox_reg_weights, ListInt, {1,1,1,1})
+    .ATTR(post_nms_topn, Int, 304)
+    .OP_END_FACTORY_REG(FSRDetectionOutput)
+}  // namespace ge
+#endif  // GE_OP_FSRDETECTIONOUTPUT_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h
index 94143ac1..2ac7a70e 100644
--- a/third_party/fwkacllib/inc/ops/image_ops.h
+++ b/third_party/fwkacllib/inc/ops/image_ops.h
@@ -21,26 +21,111 @@

namespace ge {

+/**
+*@brief Adjust the hue of one or more images.
+
+*@par Inputs:
+*Input images must be a tensor of at least 3 dimensions. The last dimension is \n
+interpreted as channels, and must be three. Inputs include: \n
+*@li images:A Tensor of type float. Images to adjust. At least 3-D.
+*@li delta:A Tensor of type float. A float delta to add to the hue.
+
+*@par Outputs:
+*y:A Tensor of type float.
+
+*@attention Constraints: \n
+*Input images must be a tensor of at least 3 dimensions. The last dimension is \n
+interpreted as channels, and must be three.
+
+*/
+
 REG_OP(AdjustHue)
-    .INPUT(images, TensorType({DT_FLOAT}))
+    .INPUT(images, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(delta, TensorType({DT_FLOAT}))
-    .OUTPUT(y, TensorType({DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OP_END_FACTORY_REG(AdjustHue)

+/**
+*@brief Adjust the saturation of one or more images.
+
+*@par Inputs:
+*Input images must be a tensor of at least 3 dimensions. The last dimension is \n
+interpreted as channels, and must be three. Inputs include: \n
+*@li images:A Tensor of type float. Images to adjust. At least 3-D.
+*@li scale:A Tensor of type float. A float scale to add to the saturation.
+
+*@par Outputs:
+*y:A Tensor of type float.
+
+*@attention Constraints: \n
+*Input images must be a tensor of at least 3 dimensions. The last dimension is \n
+interpreted as channels, and must be three.
+
+*/
+
 REG_OP(AdjustSaturation)
-    .INPUT(images, TensorType({DT_FLOAT}))
+    .INPUT(images, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(scale, TensorType({DT_FLOAT}))
-    .OUTPUT(y, TensorType({DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OP_END_FACTORY_REG(AdjustSaturation)

+/**
+*@brief Adjust the contrast of one or more images.
+
+*@par Inputs:
+*Input images must be a tensor of at least 3 dimensions. The last 3 dimensions are \n
+interpreted as '[height, width, channels]'. Inputs include: \n
+*@li images:A Tensor of type float. Images to adjust. At least 3-D.
+*@li contrast_factor:A Tensor of type float. A float multiplier for adjusting contrast.
+
+*@par Outputs:
+*y:A Tensor of type float.
+
+*@attention Constraints: \n
+*Input images must be a tensor of at least 3 dimensions. The last dimension is \n
+interpreted as channels, and must be three.
+
+*/
+
 REG_OP(AdjustContrast)
-    .INPUT(images, TensorType({DT_FLOAT}))
+    .INPUT(images, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(contrast_factor, TensorType({DT_FLOAT}))
-    .OUTPUT(y, TensorType({DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OP_END_FACTORY_REG(AdjustContrast)

+/**
+*@brief Extracts crops from the input image tensor and resizes them. Extracts \n
+crops from the input image tensor and resizes them using bilinear sampling or \n
+nearest neighbor sampling to a common output size specified by crop_size.
+
+*@par Inputs:
+*Input x must be a 4-D tensor. Inputs include: \n
+*@li x:A Tensor. Must be one of the following types:uint8, uint16, int8, \n
+int16, int32, int64, float16, float, double. A 4-D tensor of shape \n
+[batch, image_height, image_width, depth].
+*@li boxes: A Tensor of type float. A 2-D tensor of shape [num_boxes, 4].
+*@li box_index: A Tensor of type int32. A 1-D tensor of shape [num_boxes] with \n
+int32 values in [0, batch).
+*@li crop_size: A Tensor of type int32. A 1-D tensor of 2 elements, crop_size \n
+= [crop_height, crop_width]. All cropped image patches are resized to this size.
+
+*@par Attributes:
+*@li extrapolation_value: An optional float. Defaults to 0. Value used for \n
+extrapolation, when applicable.
+*@li method: An optional string from: '"bilinear", "nearest"'. Defaults to \n
+"bilinear". Currently two sampling methods are supported: Bilinear and \n
+NearestNeighbor.
+
+*@par Outputs:
+*y:A Tensor of type float.
+
+*@attention Constraints: \n
+*Input x must be a 4-D tensor.
+
+*/
+
 REG_OP(CropAndResize)
-    .INPUT(images, TensorType({DT_UINT8, DT_UINT16, DT_INT8, \
+    .INPUT(x, TensorType({DT_UINT8, DT_UINT16, DT_INT8, \
        DT_INT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
     .INPUT(boxes, TensorType({DT_FLOAT}))
     .INPUT(box_index, TensorType({DT_INT32}))
@@ -50,6 +135,34 @@ REG_OP(CropAndResize)
     .ATTR(method, String, "bilinear")
     .OP_END_FACTORY_REG(CropAndResize)

+/**
+*@brief Computes the gradient of the crop_and_resize op wrt the input \n
+boxes tensor.
+
+*@par Inputs:
+*Input images and grads must be 4-D tensors. Inputs include: \n
+*@li grads: A 4-D tensor of shape [num_boxes, crop_height, crop_width, depth].
+*@li images: A 4-D tensor of shape [batch, image_height, image_width, depth]. \n
+Both image_height and image_width need to be positive.
+*@li boxes: A 2-D tensor of shape [num_boxes, 4]. The i-th row of the tensor \n
+specifies the coordinates of a box in the box_ind[i] image and is specified in \n
+normalized coordinates [y1, x1, y2, x2].
+*@li box_index: A 1-D tensor of shape [num_boxes] with int32 values in \n
+[0, batch). The value of box_ind[i] specifies the image that the i-th box \n
+refers to.
+
+*@par Attributes:
+method: A string specifying the interpolation method. Only 'bilinear' is \n
+supported for now.
+
+*@par Outputs:
+*y:A 2-D tensor of shape [num_boxes, 4].
+
+*@attention Constraints: \n
+*Input images and grads must be 4-D tensors.
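+
+*@par Example:
+*A shape-only sketch (sizes are assumptions for illustration): with grads of \n
+shape [2, 24, 24, 3], images [8, 256, 256, 3], boxes [2, 4] and box_index [2], \n
+the output y has shape [2, 4].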
+
+*/
+
 REG_OP(CropAndResizeGradBoxes)
     .INPUT(grads, TensorType({DT_FLOAT}))
     .INPUT(images, TensorType({DT_UINT8, DT_UINT16, DT_INT8, DT_INT16, \
@@ -60,6 +173,35 @@ REG_OP(CropAndResizeGradBoxes)
     .ATTR(method, String, "bilinear")
     .OP_END_FACTORY_REG(CropAndResizeGradBoxes)

+/**
+*@brief Computes the gradient of the crop_and_resize op wrt the input \n
+images tensor.
+
+*@par Inputs:
+*Input grads must be a 4-D tensor. Inputs include: \n
+*@li grads: A 4-D tensor of shape [num_boxes, crop_height, crop_width, depth].
+*@li boxes: A 2-D tensor of shape [num_boxes, 4]. The i-th row of the tensor \n
+specifies the coordinates of a box in the box_ind[i] image and is specified \n
+in normalized coordinates [y1, x1, y2, x2].
+*@li box_index: A 1-D tensor of shape [num_boxes] with int32 values in \n
+[0, batch). The value of box_ind[i] specifies the image that the i-th box \n
+refers to.
+*@li image_size: A 1-D tensor with value [batch, image_height, image_width, \n
+depth] containing the original image size. Both image_height and image_width \n
+need to be positive.
+
+*@par Attributes:
+method: A string specifying the interpolation method. Only 'bilinear' is \n
+supported for now.
+
+*@par Outputs:
+*y:A 4-D tensor of shape [batch, image_height, image_width, depth].
+
+*@attention Constraints: \n
+*Input grads must be a 4-D tensor.
+
+*/
+
 REG_OP(CropAndResizeGradImage)
     .INPUT(grads, TensorType({DT_FLOAT}))
     .INPUT(boxes, TensorType({DT_FLOAT}))
@@ -70,6 +212,39 @@ REG_OP(CropAndResizeGradImage)
     .REQUIRED_ATTR(T, Type)
     .OP_END_FACTORY_REG(CropAndResizeGradImage)

+/**
+*@brief Extracts a glimpse from the input tensor.
+
+*@par Inputs:
+*Input x must be a 4-D tensor. Inputs include: \n
+*@li x: A 4-D float tensor of shape [batch_size, height, width, channels].
+*@li size: A 1-D tensor of 2 elements containing the size of the glimpses to \n
+extract. The glimpse height must be specified first, followed by the glimpse \n
+width.
+*@li offsets: A 2-D integer tensor of shape [batch_size, 2] containing the y, \n
+x locations of the center of each window.
+
+*@par Attributes:
+*@li centered: indicates if the offset coordinates are centered relative to \n
+the image, in which case the (0, 0) offset is relative to the center of the \n
+input images. If false, the (0,0) offset corresponds to the upper left corner \n
+of the input images.
+*@li normalized: indicates if the offset coordinates are normalized.
+*@li uniform_noise: indicates if the noise should be generated using a \n
+uniform distribution or a Gaussian distribution.
+*@li noise: indicates if the noise should be uniform, gaussian, or zero. \n
+The default is uniform, which means the noise type will be decided by \n
+uniform_noise.
+
+*@par Outputs:
+*y:A tensor representing the glimpses [batch_size, glimpse_height, \n
+glimpse_width, channels].
+
+*@attention Constraints: \n
+*Input x must be a 4-D tensor.
+
+*/
+
 REG_OP(ExtractGlimpse)
     .INPUT(x, TensorType({DT_FLOAT}))
     .INPUT(size, TensorType({DT_INT32}))
@@ -81,23 +256,87 @@ REG_OP(ExtractGlimpse)
     .ATTR(noise, String, "uniform")
     .OP_END_FACTORY_REG(ExtractGlimpse)

+/**
+*@brief Convert one or more images from HSV to RGB.
+
+*@par Inputs:
+*Last dimension of input images must be size 3. Inputs include: \n
+*images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+
+*@par Outputs:
+*y:images converted to RGB.
+
+*@attention Constraints: \n
+*Last dimension of input images must be size 3.
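+
+*@par Example:
+*A worked pixel (illustrative only): the HSV value (0.0, 1.0, 1.0), i.e. hue 0 \n
+with full saturation and value, converts to the RGB value (1.0, 0.0, 0.0).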
+
+*/
+
 REG_OP(HSVToRGB)
-    .INPUT(images, TensorType({ DT_FLOAT, DT_DOUBLE }))
-    .OUTPUT(y, TensorType({ DT_FLOAT, DT_DOUBLE }))
+    .INPUT(images, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE}))
     .OP_END_FACTORY_REG(HSVToRGB)

+/**
+*@brief Resize quantized images to size using quantized bilinear interpolation.
+
+*@par Inputs:
+*Input images must be a 4-D tensor. Inputs include: \n
+*@li images: 4-D with shape [batch, height, width, channels].
+*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width. The new \n
+size for the images.
+*@li min: A Tensor of type float.
+*@li max: A Tensor of type float.
+
+*@par Attributes:
+*@li align_corners: An optional bool. Defaults to False. If true, the centers \n
+of the 4 corner pixels of the input and output tensors are aligned, preserving \n
+the values at the corner pixels.
+*@li half_pixel_centers: An optional bool. Defaults to False.
+
+*@par Outputs:
+*@li resized_images: 4-D with shape [batch, new_height, new_width, channels].
+*@li y_min: A Tensor of type float.
+*@li y_max: A Tensor of type float.
+
+*@attention Constraints: \n
+*Input images and output images must be quantized types.
+
+*/
+
 REG_OP(QuantizedResizeBilinear)
-    .INPUT(images, TensorType({ DT_FLOAT }))
+    .INPUT(images, TensorType({DT_QUINT8,DT_QINT32,DT_FLOAT}))
     .INPUT(size, TensorType({ DT_INT32 }))
     .INPUT(min, TensorType({ DT_FLOAT }))
     .INPUT(max, TensorType({ DT_FLOAT }))
-    .OUTPUT(resized_images, TensorType({ DT_FLOAT }))
+    .OUTPUT(resized_images, TensorType({DT_QUINT8,DT_QINT32,DT_FLOAT }))
     .OUTPUT(y_min, TensorType({ DT_FLOAT }))
     .OUTPUT(y_max, TensorType({ DT_FLOAT }))
     .ATTR(align_corners, Bool, false)
     .ATTR(half_pixel_centers, Bool, false)
     .OP_END_FACTORY_REG(QuantizedResizeBilinear)

+/**
+*@brief Resize images to size using area interpolation.
+
+*@par Inputs:
+*Input images must be a 4-D tensor. Inputs include: \n
+*@li images: 4-D with shape [batch, height, width, channels].
+*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width. \n
+The new size for the images.
+
+*@par Attributes:
+*align_corners: If true, the centers of the 4 corner pixels of the input and \n
+output tensors are aligned, preserving the values at the corner pixels. \n
+Defaults to false.
+
+*@par Outputs:
+*y: 4-D with shape [batch, new_height, new_width, channels].
+
+*@attention Constraints: \n
+*Input images can be of different types but output images are always float.
+
+*/
+
 REG_OP(ResizeArea)
     .INPUT(images, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
@@ -106,6 +345,31 @@ REG_OP(ResizeArea)
     .ATTR(align_corners, Bool, false)
     .OP_END_FACTORY_REG(ResizeArea)

+/**
+*@brief Computes the gradient of bicubic interpolation.
+
+*@par Inputs:
+*Input grads must be a 4-D tensor. Inputs include: \n
+*@li grads: A Tensor of type float. 4-D with shape [batch, height, width, \n
+channels].
+*@li original_image: A Tensor. Must be one of the following types: float, \n
+double. 4-D with shape [batch, orig_height, orig_width, channels]. The image \n
+tensor that was resized.
+
+*@par Attributes:
+*@li align_corners: An optional bool. Defaults to False. If true, the centers \n
+of the 4 corner pixels of the input and grad tensors are aligned.
+*@li half_pixel_centers: An optional bool. Defaults to False.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as original_image.
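+
+*@par Example:
+*A shape-only sketch (sizes assumed): grads of shape [1, 8, 8, 3] resized from \n
+an original_image of shape [1, 4, 4, 3] produce y of shape [1, 4, 4, 3].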
+ +*@attention Constraints: \n +*Input images can be of different types but output images are always float. + +*/ + REG_OP(ResizeBicubicGrad) .INPUT(grads, TensorType({DT_FLOAT})) .INPUT(original_image, TensorType({DT_FLOAT, DT_DOUBLE})) @@ -114,6 +378,29 @@ REG_OP(ResizeBicubicGrad) .ATTR(half_pixel_centers, Bool, false) .OP_END_FACTORY_REG(ResizeBicubicGrad) +/** +*@brief Resize images to size using bicubic interpolation. + +*@par Inputs: +*Input images must be a 4-D tensor. Inputs include: \n +*@li images: 4-D with shape [batch, height, width, channels]. +*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width. The new \n +size for the images. + +*@par Attributes: +*@li align_corners: If true, the centers of the 4 corner pixels of the input \n +and output tensors are aligned, preserving the values at the corner pixels. \n +Defaults to false. +*@li half_pixel_centers: An optional bool. Defaults to False. + +*@par Outputs: +*y: 4-D with shape [batch, new_height, new_width, channels]. + +*@attention Constraints: \n +*Input images can be of different types but output images are always float. + +*/ + REG_OP(ResizeBicubic) .INPUT(images, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) @@ -124,25 +411,29 @@ REG_OP(ResizeBicubic) .OP_END_FACTORY_REG(ResizeBicubic) /** -*@brief Performs the backpropagation of ResizeNearestNeighbor for training scenarios. +*@brief Computes the gradient of nearest neighbor interpolation. *@par Inputs: -* Two inputs, including: -*@li grads: A 4D Tensor, specifying the backpropagation gradients. Must be one of the following types: int8, uint8, int16, uint16, int32, int64, float16, float32, float64. -*@li size: A 1D Tensor of type int32, specifying the source image size (orig_height, orig_width). - -*@par Attributes: \n -*align_corners: An optional bool. If "True", the centers of the corner pixels of the input and gradient tensors are aligned. Defaults to "False". +*Input grads must be a 4-D tensor. Inputs include: \n +*@li grads: A Tensor. Must be one of the following types: uint8, int8, int32, \n +float16, float, double. 4-D with shape [batch, height, width, channels]. +*@li size: A 1-D int32 Tensor of 2 elements: orig_height, orig_width. \n +The original input size. -*@par Outputs: \n -*y: A 4D Tensor, specifying the backpropagation gradient after computation. Has the same type as "grads". +*@par Attributes: +*@li align_corners: An optional bool. Defaults to False. If true, the centers \n +of the 4 corner pixels of the input and grad tensors are aligned. Defaults to \n +false. +*@li half_pixel_centers: An optional bool. Defaults to False. -*@attention Constraints: -* When the inputs are of type float32, the execution performance is high. +*@par Outputs: +*y: A Tensor. Has the same type as grads. -*@see ResizeNearestNeighbor +*@attention Constraints: \n +*Input grads must be a 4-D tensor. */ -REG_OP(ResizeNearestNeighborGrad) + +REG_OP(ResizeNearestNeighborV2Grad) .INPUT(grads, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .INPUT(size, TensorType({DT_INT32})) @@ -150,70 +441,159 @@ REG_OP(ResizeNearestNeighborGrad) DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .ATTR(align_corners, Bool, false) .ATTR(half_pixel_centers, Bool, false) - .OP_END_FACTORY_REG(ResizeNearestNeighborGrad) + .OP_END_FACTORY_REG(ResizeNearestNeighborV2Grad) + +/** +*@brief Computes the gradient of nearest neighbor interpolation. 
+
+*@par Inputs:
+*Input grads must be a 4-D tensor. Inputs include: \n
+*grads: A Tensor. 4-D with shape [batch, height, width, channels].
+
-REG_OP(ResizeNearestNeighborGradD)
+*@par Attributes:
+*@li align_corners: An optional bool. Defaults to False. If true, the centers \n
+of the 4 corner pixels of the input and grad tensors are aligned.
+*@li size: A required list of ints, specifying the output image size.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as grads.
+
+*/
+
+REG_OP(ResizeNearestNeighborV2GradD)
     .INPUT(grads, TensorType({DT_FLOAT}))
     .OUTPUT(y, TensorType({DT_FLOAT}))
     .REQUIRED_ATTR(size, ListInt)
     .ATTR(align_corners, Bool, false)
-    .OP_END_FACTORY_REG(ResizeNearestNeighborGradD)
+    .ATTR(half_pixel_centers, Bool, false)
+    .OP_END_FACTORY_REG(ResizeNearestNeighborV2GradD)

/**
-*@brief Performs the backpropagation of ResizeBilinear, which is used to resize an image\n to a specified size, while this operator is used to restore the resized image to the original image.
-*@par Inputs:
-* Two inputs, including:
-* @li grads: A float32 input in NC1HWC0 format, describing the image information after resizing,\n including the image height, width, number of channels, and number of images.
-* @li original_image: A float32 input in NC1HWC0 format, describing the image information before resizing,\n including the image height, width, number of channels, and number of images.
+*@brief Computes the gradient of bilinear interpolation.
+*@par Inputs:
+*Input grads must be a 4-D tensor. Inputs include: \n
+*@li grads: A Tensor of type float32. 4-D with shape [batch, height, width, \n
+channels].
+*@li original_image: A Tensor. 4-D with shape [batch, orig_height, orig_width, \n
+channels]. The image tensor that was resized.

*@par Attributes:
-*align_corners: An optional bool. If "True", the centers of the corner pixels of the input and\n gradient tensors are aligned. Defaults to "False".
+align_corners: An optional bool. Defaults to False. If true, the centers of \n
+the 4 corner pixels of the input and grad tensors are aligned.

*@par Outputs:
-*y: A float32 output in NC1HWC0 format, specifying the image information before resizing, including the image height,\n
-width, number of channels, and number of images.
+*y: A Tensor. Has the same type as original_image.
+
+*@attention Constraints: \n
+*Input grads must be a 4-D tensor.

*/
-REG_OP(ResizeBilinearGrad)
+
+REG_OP(ResizeBilinearV2Grad)
     .INPUT(grads, TensorType({DT_FLOAT}))
     .INPUT(original_image, TensorType::FloatingDataType())
     .OUTPUT(y, TensorType({DT_FLOAT}))
     .ATTR(align_corners, Bool, false)
-    .OP_END_FACTORY_REG(ResizeBilinearGrad)
+    .ATTR(half_pixel_centers, Bool, false)
+    .OP_END_FACTORY_REG(ResizeBilinearV2Grad)

/**
-*@brief Resizes "images" to "size" using bilinear interpolation.
+*@brief Resize images to size using bilinear interpolation.

*@par Inputs:
-* Two inputs, including:
-*@li images: An NC1HWC0 Tensor.
-* Must be one of the following types: int8, uint8, int16, uint16, int32, int64, float16, float32, double
-*@li size: An ND Tensor of type int32.
+*Input images must be a 4-D tensor. Inputs include: \n
+*@li x: 4-D with shape [batch, height, width, channels].
+*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width. The new \n
+size for the images.

*@par Attributes:
-*align_corners: An optional bool. If "true", the centers of the corner pixels of the input and output tensors are aligned. Defaults to "false".
+*align_corners: If true, the centers of the 4 corner pixels of the input and \n
+output tensors are aligned, preserving the values at the corner pixels. \n
+Defaults to false.

*@par Outputs:
-*y: A Tensor with the same format as input "images".
+*y: 4-D with shape [batch, new_height, new_width, channels].
+
+*@attention Constraints: \n
+*Input images can be of different types but output images are always float.

*/
-REG_OP(ResizeBilinear)
-    .INPUT(images, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
+
+REG_OP(ResizeBilinearV2)
+    .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
     .INPUT(size, TensorType({DT_INT32}))
     .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
        DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
     .ATTR(align_corners, Bool, false)
-    .OP_END_FACTORY_REG(ResizeBilinear)
+    .ATTR(half_pixel_centers, Bool, false)
+    .OP_END_FACTORY_REG(ResizeBilinearV2)
+
+/**
+*@brief Converts one or more images from RGB to HSV.
+
+*@par Inputs:
+*Last dimension of input images must be size 3. Inputs include: \n
+*images: A Tensor. Must be one of the following types: float, double. 1-D or \n
+higher rank. RGB data to convert. Last dimension must be size 3.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as images.
+
+*@attention Constraints: \n
+*Outputs a tensor of the same shape as the images tensor, containing the HSV \n
+value of the pixels. The output is only well defined if the values in images \n
+are in [0,1].
+
+*/

 REG_OP(RGBToHSV)
-    .INPUT(images, TensorType({ DT_FLOAT, DT_DOUBLE }))
-    .OUTPUT(y, TensorType({ DT_FLOAT, DT_DOUBLE }))
+    .INPUT(images, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
     .OP_END_FACTORY_REG(RGBToHSV)

+/**
+*@brief Generate a single randomly distorted bounding box for an image.
+
+*@par Inputs:
+*Input images must be a 4-D tensor. Inputs include: \n
+*@li image_size: 1-D, containing [height, width, channels].
+*@li bounding_boxes: 3-D with shape [batch, N, 4] describing the N bounding \n
+boxes associated with the image.
+*@li min_object_covered: The cropped area of the image must contain at least \n
+this fraction of any bounding box supplied. The value of this parameter should \n
+be non-negative. In the case of 0, the cropped area does not need to overlap \n
+any of the bounding boxes supplied.
+
+*@par Attributes:
+*@li seed: If either seed or seed2 is set to non-zero, the random number \n
+generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
+*@li seed2: A second seed to avoid seed collision.
+*@li aspect_ratio_range: The cropped area of the image must have an aspect \n
+ratio = width / height within this range.
+*@li max_attempts: Number of attempts at generating a cropped region of the \n
+image of the specified constraints. After max_attempts failures, return the \n
+entire image.
+*@li use_image_if_no_bounding_boxes: Controls behavior if no bounding boxes \n
+supplied. If true, assume an implicit bounding box covering the whole input. \n
+If false, raise an error.
+
+*@par Outputs:
+*@li begin: 1-D, containing [offset_height, offset_width, 0].
+*@li size: 1-D, containing [target_height, target_width, -1].
+*@li bboxes: 3-D with shape [1, 1, 4] containing the distorted bounding box.
+
+*@attention Constraints: \n
+*Input bounding_boxes must be 3-D with shape [batch, N, 4].
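+
+*@par Example:
+*A usage sketch (the Slice wiring and the two-argument set_input_* overload \n
+are assumptions, not code from this header): the begin and size outputs can \n
+feed a Slice operator to perform the crop:
+*@code
+*  ge::op::SampleDistortedBoundingBoxExt2 sample("sample_bbox");
+*  ge::op::Slice crop("crop");
+*  crop.set_input_x(image)
+*      .set_input_offsets(sample, "begin")
+*      .set_input_size(sample, "size");
+*@endcode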
+ +*/ + REG_OP(SampleDistortedBoundingBoxExt2) .INPUT(image_size, TensorType({ DT_UINT8, DT_INT8, DT_INT16, \ DT_INT32, DT_INT64 })) - .INPUT(bounding_boxes, TensorType({ DT_FLAOT })) + .INPUT(bounding_boxes, TensorType({ DT_FLOAT })) .INPUT(min_object_covered, TensorType({ DT_FLOAT })) .OUTPUT(begin, TensorType({ DT_UINT8, DT_INT8, DT_INT16, \ DT_INT32, DT_INT64 })) @@ -229,28 +609,50 @@ REG_OP(SampleDistortedBoundingBoxExt2) .OP_END_FACTORY_REG(SampleDistortedBoundingBoxExt2) /** -*@brief Resizes "images" to "size" using nearest neighbor interpolation. +*@brief Resize images to size using nearest neighbor interpolation. *@par Inputs: -* Two inputs, including: -*@li images: An NC1HWC0 Tensor. -* Must be one of the following types: int8, uint8, int16, uint16, int32, int64, float16, float32, double -*@li size: An ND Tensor of type int32. +*Input x must be a 4-D tensor. Inputs include: \n +*@li x: 4-D with shape [batch, height, width, channels]. +*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width. \n +The new size for the images. *@par Attributes: -*align_corners: An optional bool. If "true", the centers of the corner pixels of the input and output tensors are aligned. Defaults to "false". +*align_corners: If true, the centers of the 4 corner pixels of the input and \n +output tensors are aligned, preserving the values at the corner pixels. \n +Defaults to false. *@par Outputs: -*y: A Tensor with the same type and format as input "images". +*y: 4-D with shape [batch, new_height, new_width, channels]. */ -REG_OP(ResizeNearestNeighbor) - .INPUT(images, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, + +REG_OP(ResizeNearestNeighborV2) + .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .INPUT(size, TensorType({DT_INT32})) .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .ATTR(align_corners, Bool, false) - .OP_END_FACTORY_REG(ResizeNearestNeighbor) + .ATTR(half_pixel_centers, Bool, false) + .OP_END_FACTORY_REG(ResizeNearestNeighborV2) + +/** +*@brief Draw bounding boxes on a batch of images. + +*@par Inputs: +*Input images must be a 4-D tensor. Inputs include: \n +*@li images: A Tensor. Must be one of the following types: float. 4-D with \n +shape [batch, height, width, depth]. A batch of images. +*@li boxes: A Tensor of type float32. 3-D with shape [batch, \n +num_bounding_boxes, 4] containing bounding boxes. + +*@par Outputs: +*A Tensor. Has the same type as images. + +*@attention Constraints: \n +*Input images must be a 4-D tensor. + +*/ REG_OP(DrawBoundingBoxes) .INPUT(images, TensorType({DT_FLOAT})) @@ -258,6 +660,31 @@ REG_OP(DrawBoundingBoxes) .OUTPUT(y, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(DrawBoundingBoxes) +/** +*@brief Greedily selects a subset of bounding boxes in descending order of \n +score. + +*@par Inputs: +*Input boxes and scores must be float type. Inputs include: \n +*@li boxes: A 2-D float tensor of shape [num_boxes, 4]. +*@li scores: A 1-D float tensor of shape [num_boxes] representing a single \n +score corresponding to each box (each row of boxes). +*@li max_output_size: A scalar integer tensor representing the maximum number \n +of boxes to be selected by non max suppression. + +*@par Attributes: +*iou_threshold: A float representing the threshold for deciding whether boxes \n +overlap too much with respect to IOU. 
+ +*@par Outputs: +*selected_indices: A 1-D integer tensor of shape [M] representing the selected \n +indices from the boxes tensor, where M <= max_output_size. + +*@attention Constraints: \n +*Input boxes and scores must be float type. + +*/ + REG_OP(NonMaxSuppression) .INPUT(boxes, TensorType({DT_FLOAT})) .INPUT(scores, TensorType({DT_FLOAT})) @@ -266,34 +693,140 @@ REG_OP(NonMaxSuppression) .ATTR(iou_threshold, Float, 0.5f) .OP_END_FACTORY_REG(NonMaxSuppression) +/** +*@brief Greedily selects a subset of bounding boxes in descending order of \n +score. + +*@par Inputs: +*Input boxes and scores must be float type. Inputs include: \n +*@li boxes: A 2-D float tensor of shape [num_boxes, 4]. +*@li scores: A 1-D float tensor of shape [num_boxes] representing a single \n +score corresponding to each box (each row of boxes). +*@li max_output_size: A scalar integer tensor representing the maximum number \n +of boxes to be selected by non max suppression. +*@li iou_threshold: A 0-D float tensor representing the threshold for deciding \n +whether boxes overlap too much with respect to IOU. + +*@par Outputs: +*selected_indices: A 1-D integer tensor of shape [M] representing the selected \n +indices from the boxes tensor, where M <= max_output_size. + +*@attention Constraints: \n +*Input boxes and scores must be float type. + +*/ + REG_OP(NonMaxSuppressionV2) .INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT})) .INPUT(scores, TensorType({DT_FLOAT16, DT_FLOAT})) .INPUT(max_output_size, TensorType({DT_INT32})) - .INPUT(iou_threshold, TensorType({DT_FLOAT})) + .INPUT(iou_threshold, TensorType({DT_FLOAT16,DT_FLOAT})) .OUTPUT(selected_indices, TensorType({DT_INT32})) .OP_END_FACTORY_REG(NonMaxSuppressionV2) +/** +*@brief Greedily selects a subset of bounding boxes in descending order of \n +score. + +*@par Inputs: +*Input boxes and scores must be float type. Inputs include: \n +*@li boxes: A 2-D float tensor of shape [num_boxes, 4]. +*@li scores: A 1-D float tensor of shape [num_boxes] representing a single \n +score corresponding to each box (each row of boxes). +*@li max_output_size: A scalar integer tensor representing the maximum number \n +of boxes to be selected by non max suppression. +*@li iou_threshold: A 0-D float tensor representing the threshold for deciding \n +whether boxes overlap too much with respect to IOU. +*@li score_threshold: A 0-D float tensor representing the threshold for \n +deciding when to remove boxes based on score. + +*@par Outputs: +*selected_indices: A 1-D integer tensor of shape [M] representing the selected \n +indices from the boxes tensor, where M <= max_output_size. + +*@attention Constraints: \n +*Input boxes and scores must be float type. + +*/ + REG_OP(NonMaxSuppressionV3) .INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT})) .INPUT(scores, TensorType({DT_FLOAT16, DT_FLOAT})) .INPUT(max_output_size, TensorType({DT_INT32})) - .INPUT(iou_threshold, TensorType({DT_FLOAT})) - .INPUT(score_threshold, TensorType({DT_FLOAT})) + .INPUT(iou_threshold, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(score_threshold, TensorType({DT_FLOAT16,DT_FLOAT})) .OUTPUT(selected_indices, TensorType({DT_INT32})) .OP_END_FACTORY_REG(NonMaxSuppressionV3) +/** +*@brief Greedily selects a subset of bounding boxes in descending order of \n +score. + +*@par Inputs: +*Input boxes and scores must be float type. Inputs include: \n +*@li boxes: A 2-D float tensor of shape [num_boxes, 4]. 
+*@li scores: A 1-D float tensor of shape [num_boxes] representing a single \n
+score corresponding to each box (each row of boxes).
+*@li max_output_size: A scalar integer tensor representing the maximum number \n
+of boxes to be selected by non max suppression.
+*@li iou_threshold: A 0-D float tensor representing the threshold for deciding \n
+whether boxes overlap too much with respect to IOU.
+*@li score_threshold: A 0-D float tensor representing the threshold for \n
+deciding when to remove boxes based on score.
+
+*@par Attributes:
+*pad_to_max_output_size: If true, the output selected_indices is padded \n
+to be of length max_output_size. Defaults to false.
+
+*@par Outputs:
+*@li selected_indices: A 1-D integer tensor of shape [M] representing the \n
+selected indices from the boxes tensor, where M <= max_output_size.
+*@li valid_outputs: A 0-D integer tensor representing the number of valid \n
+elements in selected_indices, with the valid elements appearing first.
+
+*@attention Constraints: \n
+*Input boxes and scores must be float type.
+
+*/
+
 REG_OP(NonMaxSuppressionV4)
     .INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(scores, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(max_output_size, TensorType({DT_INT32}))
-    .INPUT(iou_threshold, TensorType({DT_FLOAT}))
-    .INPUT(score_threshold, TensorType({DT_FLOAT}))
+    .INPUT(iou_threshold, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(score_threshold, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OUTPUT(selected_indices, TensorType({DT_INT32}))
     .OUTPUT(valid_outputs, TensorType({DT_INT32}))
     .ATTR(pad_to_max_output_size, Bool, false)
     .OP_END_FACTORY_REG(NonMaxSuppressionV4)

+/**
+*@brief Greedily selects a subset of bounding boxes in descending order of \n
+score.
+
+*@par Inputs:
+*Input overlaps and scores must be float type. Inputs include: \n
+*@li overlaps: A 2-D float tensor of shape [num_boxes, num_boxes] \n
+representing the n-by-n box overlap values.
+*@li scores: A 1-D float tensor of shape [num_boxes] representing a single \n
+score corresponding to each box (each row of boxes).
+*@li max_output_size: A scalar integer tensor representing the maximum number \n
+of boxes to be selected by non max suppression.
+*@li overlap_threshold: A 0-D float tensor representing the threshold for \n
+deciding whether boxes overlap too much.
+*@li score_threshold: A 0-D float tensor representing the threshold for \n
+deciding when to remove boxes based on score.
+
+*@par Attributes:
+*pad_to_max_output_size: If true, the output selected_indices is padded \n
+to be of length max_output_size. Defaults to false.
+
+*@par Outputs:
+*selected_indices: A 1-D integer tensor of shape [M] representing the \n
+selected indices from the boxes tensor, where M <= max_output_size.
+
+*/
+
 REG_OP(NonMaxSuppressionWithOverlaps)
     .INPUT(overlaps, TensorType({DT_FLOAT}))
     .INPUT(scores, TensorType({DT_FLOAT}))
@@ -303,6 +836,33 @@ REG_OP(NonMaxSuppressionWithOverlaps)
     .OUTPUT(selected_indices, TensorType({DT_INT32}))
     .OP_END_FACTORY_REG(NonMaxSuppressionWithOverlaps)

+/**
+*@brief JPEG-encode an image.
+
+*@par Inputs:
+*Input image must be uint8 type. Inputs include: \n
+*image: A 3-D uint8 Tensor of shape [height, width, channels].
+
+*@par Attributes:
+*@li format: Per pixel image format.
+*@li quality: Quality of the compression from 0 to 100 (higher is better \n
+and slower).
+*@li progressive: If True, create a JPEG that loads progressively (coarse \n
+to fine).
+*@li optimize_size: If True, spend CPU/RAM to reduce size with no quality \n
+change.
+*@li chroma_downsampling: A boolean, default is true.
+*@li density_unit: Unit used to specify x_density and y_density: pixels per \n
+inch ('in') or centimeter ('cm').
+*@li x_density: Horizontal pixels per density unit.
+*@li y_density: Vertical pixels per density unit.
+*@li xmp_metadata: If not empty, embed this XMP metadata in the image header.
+
+*@par Outputs:
+*contents: 0-D. JPEG-encoded image.
+
+*/
+
 REG_OP(EncodeJpeg)
     .INPUT(image, TensorType({DT_UINT8}))
     .OUTPUT(contents, TensorType({DT_STRING}))
@@ -317,6 +877,23 @@
     .ATTR(xmp_metadata, String, "")
     .OP_END_FACTORY_REG(EncodeJpeg)

+/**
+*@brief PNG-encode an image.
+
+*@par Inputs:
+*Input image must be uint8 or uint16 type. Inputs include: \n
+*image: A 3-D uint8 or uint16 Tensor of shape [height, width, channels] \n
+where channels is: 1: for grayscale; 2: for grayscale + alpha; 3: for RGB; \n
+4: for RGBA.
+
+*@par Attributes:
+*compression: Compression level.
+
+*@par Outputs:
+*contents: 0-D. PNG-encoded image.
+
+*/
+
 REG_OP(EncodePng)
     .INPUT(image, TensorType({DT_UINT8, DT_UINT16}))
     .OUTPUT(contents, TensorType({DT_STRING}))
@@ -328,57 +905,245 @@

*@par Inputs:
* One input:
-*images: An NC1HWC0 Tensor. \n
+*x: An NC1HWC0 Tensor. \n
* Must be one of the following types: float16, float32.

*@par Attributes:
-*@li size: A required int32 Tensor specifying the new size for the images. No default value.
-*@li align_corners: An optional bool. If "true", the centers of the corner pixels of the input and output tensors are aligned. Defaults to "false".
+*@li size: A required list of 2 int32 elements, specifying the new size for \n
+the images. No default value.
+*@li align_corners: An optional bool. If "true", the centers of the corner \n
+pixels of the input and output tensors are aligned. Defaults to "false".

*@par Outputs:
-*y: A Tensor with type float32 and the same format as input "images".
+*y: A Tensor with type float32 and the same format as input "x".

*@attention Constraints:
-*@li The input "size" must be a tensor of 2 elements: size[0] <= 2048, size[1] <= 2048.
-*@li The input "images" must be a tensor of 5 elements: images[2] <= 2048, images[3] <= 2048.
+*@li The input "size" must be a tensor of 2 elements: size[0] <= 2048, \n
+size[1] <= 2048.
+*@li The input "images" must be a tensor of 5 elements: images[2] <= 2048, \n
+images[3] <= 2048.
*/
-REG_OP(ResizeBilinearD)
-    .INPUT(images, TensorType({DT_FLOAT16, DT_FLOAT}))
+REG_OP(ResizeBilinearV2D)
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
     .ATTR(align_corners, Bool, false)
+    .ATTR(half_pixel_centers, Bool, false)
     .REQUIRED_ATTR(size, ListInt)
-    .OP_END_FACTORY_REG(ResizeBilinearD)
+    .OP_END_FACTORY_REG(ResizeBilinearV2D)

/**
*@brief Resizes "images" to "size" using nearest neighbor interpolation.

*@par Inputs:
* One input:
-*images: An NC1HWC0 Tensor. \n
+*x: An NC1HWC0 Tensor. \n
* Must be one of the following types: float16, float32, int32, int8, uint8

*@par Attributes:
-*@li size: A required int32 Tensor specifying the new size for the images. No default value.
-*@li align_corners: An optional bool. If "true", the centers of the corner pixels of the input and output tensors are aligned. Defaults to "false".
+*@li size: A required list of 2 int32 elements, specifying the new size for \n
+the images. No default value.
+*@li align_corners: An optional bool. If "true", the centers of the corner \n
+pixels of the input and output tensors are aligned. Defaults to "false".

*@par Outputs:
-*y: A Tensor with the same type and format as input "images".
+*y: A Tensor with the same type and format as input "x".
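+
+*@par Example:
+*A minimal sketch (assuming the generated C++ class and its chainable \n
+set_input_*/set_attr_* setters; the size values are placeholders):
+*@code
+*  ge::op::ResizeNearestNeighborV2D resize("resize");
+*  resize.set_input_x(images)
+*        .set_attr_size({224, 224})
+*        .set_attr_align_corners(false);
+*@endcode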
*@attention Constraints:
-* The input "size" must be a tensor of 2 elements: size[0] <= 7680, size[1] <= 4320
+* The input "size" must be a tensor of 2 elements: size[0] <= 7680, \n
+size[1] <= 4320
*/
-REG_OP(ResizeNearestNeighborD)
-    .INPUT(images, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
+REG_OP(ResizeNearestNeighborV2D)
+    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
     .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
     .REQUIRED_ATTR(size, ListInt)
     .ATTR(align_corners, Bool, false)
-    .OP_END_FACTORY_REG(ResizeNearestNeighborD)
+    .ATTR(half_pixel_centers, Bool, false)
+    .OP_END_FACTORY_REG(ResizeNearestNeighborV2D)
+
+/**
+*@brief Extract the shape information of a JPEG-encoded image.
+
+*@par Inputs:
+*Input contents must be 0-D. Inputs include: \n
+*contents: 0-D. The JPEG-encoded image.
+
+*@par Attributes:
+*output_type: A required type, specifying the output type of the operation \n
+(int32 or int64).
+
+*@par Outputs:
+*image_shape: 1-D. The image shape with format [height, width, channels].
+*/

 REG_OP(ExtractJpegShape)
     .INPUT(contents, TensorType({DT_STRING}))
     .OUTPUT(image_shape, TensorType({DT_INT32, DT_INT64}))
     .REQUIRED_ATTR(output_type, Type)
     .OP_END_FACTORY_REG(ExtractJpegShape)
+
+/**
+*@brief Draw bounding boxes on a batch of images.
+
+*@par Inputs:
+*@li images: 4-D with shape `[batch, height, width, depth]`. \n
+A batch of images.
+*@li boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` \n
+containing bounding boxes.
+*@li colors: 2-D. A list of RGBA colors to cycle through for the boxes.
+
+*@par Outputs:
+*y: Returns 4-D with the same shape as `images`. \n
+The batch of input images with bounding boxes drawn on the images.
+*/
+
+REG_OP(DrawBoundingBoxesV2)
+    .INPUT(images, TensorType({DT_FLOAT}))
+    .INPUT(boxes, TensorType({DT_FLOAT}))
+    .INPUT(colors, TensorType({DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT}))
+    .OP_END_FACTORY_REG(DrawBoundingBoxesV2)
+
+/**
+*@brief Greedily selects a subset of bounding boxes in descending order of score, \n
+pruning away boxes that have high intersection-over-union (IOU) overlap \n
+with previously selected boxes.
+
+*@par Inputs:
+*@li boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+*@li scores: A 1-D float tensor of shape `[num_boxes]` representing a single \n
+score corresponding to each box (each row of boxes).
+*@li max_output_size: A scalar integer tensor representing the maximum number of \n
+boxes to be selected by non max suppression.
+*@li iou_threshold: A 0-D float tensor representing the threshold for deciding whether \n
+boxes overlap too much with respect to IOU.
+*@li score_threshold: A 0-D float tensor representing the threshold for deciding when to \n
+remove boxes based on score.
+*@li soft_nms_sigma: A 0-D float tensor representing the sigma parameter for Soft NMS.
+
+*@par Attributes:
+*pad_to_max_output_size: If true, the output `selected_indices` is padded to be \n
+of length `max_output_size`. Defaults to false.
+
+*@par Outputs:
+*@li selected_indices: A 1-D integer tensor of shape [M] representing the \n
+selected indices from the boxes tensor, where M <= max_output_size.
+*@li selected_scores: A 1-D float tensor of shape `[M]` representing the corresponding \n
+scores for each selected box, where `M <= max_output_size`.
+*@li valid_outputs: A 0-D integer tensor representing the number of valid \n
+elements in selected_indices, with the valid elements appearing first.
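+
+*@par Example:
+*A worked case (values illustrative only): two boxes with IoU 0.9, scores \n
+[0.9, 0.8], iou_threshold 0.5, score_threshold 0.0 and soft_nms_sigma 0.0 \n
+keep only the higher-scoring box, so selected_indices = [0], \n
+selected_scores = [0.9] and valid_outputs = 1.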
+*/ + +REG_OP(NonMaxSuppressionV5) + .INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(scores, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(max_output_size, TensorType({DT_INT32})) + .INPUT(iou_threshold, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(score_threshold, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(soft_nms_sigma, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(selected_indices, TensorType({DT_INT32})) + .OUTPUT(selected_scores, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(valid_outputs, TensorType({DT_INT32})) + .ATTR(pad_to_max_output_size, Bool, false) + .REQUIRED_ATTR(T, Type) + .OP_END_FACTORY_REG(NonMaxSuppressionV5) + +/** +*@brief Resizes "images" to "size" by scale and translate. + +*@par Inputs: +*@li images: A `Tensor`. Must be one of the following types: `int8`, `uint8`, \n +`int16`, `uint16`, `int32`, `int64`, `bfloat16`, `half`, `float32`, `float64`. +*@li size: A `Tensor` of type `int32`. +*@li scale: A `Tensor` of type `float32`. +*@li translation: A `Tensor` of type `float32`. + +*@par Outputs: +*y: A Tensor with type float32. +*/ + +REG_OP(ScaleAndTranslate) + .INPUT(images, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, + DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .INPUT(size, TensorType({DT_INT32})) + .INPUT(scale, TensorType({DT_FLOAT})) + .INPUT(translation, TensorType({DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(kernel_type, String, "lanczos3") + .ATTR(antialias, Bool, true) + .OP_END_FACTORY_REG(ScaleAndTranslate) + +/** +*@brief Computes the gradient by scale and translate. + +*@par Inputs: +*@li grads: A `Tensor`. Must be one of the following types: `float32`. +*@li original_image: A `Tensor`. Must have the same type as `grads`. +*@li scale: A `Tensor` of type `float32`. +*@li translation: A `Tensor` of type `float32`. + +*@par Outputs: +*y: A `Tensor`. Has the same type as `grads`. +*/ + +REG_OP(ScaleAndTranslateGrad) + .INPUT(grads, TensorType({DT_FLOAT})) + .INPUT(original_image, TensorType({DT_FLOAT})) + .INPUT(scale, TensorType({DT_FLOAT})) + .INPUT(translation, TensorType({DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(kernel_type, String, "lanczos3") + .ATTR(antialias, Bool, true) + .OP_END_FACTORY_REG(ScaleAndTranslateGrad) + +/** +*@brief Greedily selects a subset of bounding boxes in descending order of score, \n +This operation performs non_max_suppression on the inputs per batch, across all classes. + +*@par Inputs: +*@li boxes: A 4-D float tensor of shape `[batch_size, num_boxes, q, 4]`. If `q` is 1 then \n +same boxes are used for all classes otherwise, if `q` is equal to number of \n +classes, class-specific boxes are used. +*@li scores: A 3-D float tensor of shape `[batch_size, num_boxes, num_classes]` \n +representing a single score corresponding to each box (each row of boxes). +*@li max_output_size_per_class: A scalar integer tensor representing the maximum number of \n +boxes to be selected by non max suppression per class. +*@li max_total_size: A scalar representing maximum number of boxes retained over all classes. \n +*@li iou_threshold: A 0-D float tensor representing the threshold for deciding whether \n +boxes overlap too much with respect to IOU. +*@li score_threshold: A 0-D float tensor representing the threshold for deciding when to remove \n +boxes based on score. + +*@par Attributes: +*@li pad_per_class: If false, the output nmsed boxes, scores and classes \n +are padded/clipped to `max_total_size`. 
If true, the \n
+output nmsed boxes, scores and classes are padded to be of length \n
+`max_size_per_class`*`num_classes`, unless it exceeds `max_total_size` in \n
+which case it is clipped to `max_total_size`. Defaults to false.
+*@li clip_boxes: If true, assume the box coordinates are between [0, 1] and clip the output boxes \n
+if they fall beyond [0, 1]. If false, do not do clipping and output the box \n
+coordinates as it is. If not specified, defaults to true.
+
+*@par Outputs:
+*@li nmsed_boxes: A 3-D float tensor of shape `[batch_size, max_total_size, 4]` \n
+containing the selected box coordinates.
+*@li nmsed_scores: A 2-D float tensor of shape `[batch_size, max_total_size]` \n
+containing the scores of the selected boxes.
+*@li nmsed_classes: A 2-D float tensor of shape `[batch_size, max_total_size]` \n
+containing the classes of the selected boxes.
+*@li valid_detections: A 1-D int32 tensor of shape `[batch_size]` indicating \n
+the number of valid detections per batch item.
+
+*/
+
+REG_OP(CombinedNonMaxSuppression)
+    .INPUT(boxes, TensorType({DT_FLOAT}))
+    .INPUT(scores, TensorType({DT_FLOAT}))
+    .INPUT(max_output_size_per_class, TensorType({DT_INT32}))
+    .INPUT(max_total_size, TensorType({DT_INT32}))
+    .INPUT(iou_threshold, TensorType({DT_FLOAT}))
+    .INPUT(score_threshold, TensorType({DT_FLOAT}))
+    .OUTPUT(nmsed_boxes, TensorType({DT_FLOAT}))
+    .OUTPUT(nmsed_scores, TensorType({DT_FLOAT}))
+    .OUTPUT(nmsed_classes, TensorType({DT_FLOAT}))
+    .OUTPUT(valid_detections, TensorType({DT_INT32}))
+    .ATTR(pad_per_class, Bool, false)
+    .ATTR(clip_boxes, Bool, true)
+    .OP_END_FACTORY_REG(CombinedNonMaxSuppression)
+
} // namespace ge

#endif // GE_OP_MAGE_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/linalg_ops.h b/third_party/fwkacllib/inc/ops/linalg_ops.h
index 320dfb0e..b8a12950 100644
--- a/third_party/fwkacllib/inc/ops/linalg_ops.h
+++ b/third_party/fwkacllib/inc/ops/linalg_ops.h
@@ -22,34 +22,156 @@

namespace ge {

+/**
+*@brief Computes the reverse mode backpropagated gradient of the Cholesky \n
+algorithm.
+
+*@par Inputs:
+*The input x has to be symmetric and positive definite. Inputs include: \n
+*@li x:A Tensor. Must be one of the following types: double, float32. Output \n
+of batch Cholesky algorithm x = cholesky(A). Shape is [..., M, M]. Algorithm \n
+depends only on lower triangular part of the innermost matrices of this tensor.
+*@li grad:A Tensor. Must have the same type as x. df/dx where f is some \n
+scalar function. Shape is [..., M, M]. Algorithm depends only on lower \n
+triangular part of the innermost matrices of this tensor.
+
+*@par Outputs:
+*y:A Tensor. Has the same type as x.
+
+*@attention Constraints: \n
+*The input x is a tensor of shape [..., M, M] whose inner-most 2 dimensions \n
+form square matrices. \n
+
+*/
+
 REG_OP(CholeskyGrad)
     .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE}))
     .INPUT(grad, TensorType({DT_FLOAT, DT_DOUBLE}))
     .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
     .OP_END_FACTORY_REG(CholeskyGrad)

+/**
+*@brief Computes the Cholesky decomposition of one or more square matrices.
+
+*@par Inputs:
+*The input x has to be symmetric and positive definite. Inputs include: \n
+*x:A Tensor. Must be one of the following types: double, float32. Shape \n
+is [..., M, M].
+
+*@par Outputs:
+*y:A Tensor. Has the same type as x.
+
+*@attention Constraints: \n
+*The input x is a tensor of shape [..., M, M] whose inner-most 2 dimensions \n
+form square matrices. \n
+
+*/
+
 REG_OP(Cholesky)
     .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE}))
     .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
     .OP_END_FACTORY_REG(Cholesky)

+/**
+*@brief Computes the sign and the log of the absolute value of the determinant \n
+of one or more square matrices.
+
+*@par Inputs:
+*The input x is a tensor of shape [N, M, M] whose inner-most 2 dimensions \n
+form square matrices. Inputs include: \n
+*x:A Tensor. Must be one of the following types: double, float32.
Shape is \n +[..., M, M]. + +*@par Outputs: +*@li y:A Tensor. Has the same type as x. +*@li sign:A Tensor. Has the same type as x. + +*@attention Constraints: \n +*The input x is a tensor of shape [N, M, M] whose inner-most 2 dimensions \n +form square matrices. \n + +*/ + REG_OP(LogMatrixDeterminant) .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE})) .OUTPUT(sign, TensorType({DT_FLOAT, DT_DOUBLE})) .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) .OP_END_FACTORY_REG(LogMatrixDeterminant) +/** +*@brief Computes the determinant of one or more square matrices. + +*@par Inputs: +*The input x is a tensor of shape [N, M, M] whose inner-most 2 dimensions \n +form square matrices. Inputs include: \n +*x:A Tensor. Must be one of the following types: double, float32. Shape is \n +[..., M, M]. + +*@par Outputs: +*y:A Tensor. Has the same type as x. + +*@attention Constraints: \n +*The input x is a tensor of shape [..., M, M] whose inner-most 2 dimensions \n +form square matrices. \n + +*/ + REG_OP(MatrixDeterminant) .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE})) .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) .OP_END_FACTORY_REG(MatrixDeterminant) +/** +*@brief Computes the inverse of one or more square invertible matrices or \n +their adjoints (conjugate transposes). + +*@par Inputs: +*The input x is a tensor of shape [..., M, M] whose inner-most 2 dimensions \n +form square matrices. Inputs include: \n +*x:A Tensor. Must be one of the following types: double, float. Shape is \n +[..., M, M]. + +*@par Attributes: +*adjoint:An optional bool. Defaults to False.Boolean indicating whether to \n +deal with matrix or its (block-wise) adjoint. + +*@par Outputs: +*y:A Tensor. Has the same type as x. + +*@attention Constraints: \n +*The input x is a tensor of shape [..., M, M] whose inner-most 2 dimensions \n +form square matrices. \n + +*/ + REG_OP(MatrixInverse) .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE})) .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) .ATTR(adjoint, Bool, false) .OP_END_FACTORY_REG(MatrixInverse) +/** +*@brief Solves systems of linear equations. + +*@par Inputs: +*The input rhs must have the same type as matrix. Inputs include: \n +*@li matrix:A Tensor. Must be one of the following types: double, float. \n +Shape is [..., M, M]. +*@li rhs:A Tensor. Must have the same type as matrix. Shape is [..., M, K]. + +*@par Attributes: +*adjoint:An optional bool. Defaults to False.Boolean indicating whether to \n +solve with matrix or its (block-wise) adjoint. + +*@par Outputs: +*y:A Tensor. Has the same type as matrix. + +*@attention Constraints: \n +*The input matrix is a tensor of shape [..., M, M] whose inner-most 2 \n +dimensions form square matrices. \n + +*/ + REG_OP(MatrixSolve) .INPUT(matrix, TensorType({DT_FLOAT, DT_DOUBLE})) .INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE})) @@ -57,6 +179,29 @@ REG_OP(MatrixSolve) .ATTR(adjoint, Bool, false) .OP_END_FACTORY_REG(MatrixSolve) +/** +*@brief Solves systems of linear equations. + +*@par Inputs: +*The input rhs must have the same type as matrix. Inputs include: \n +*@li matrix:A Tensor. Shape is [..., M, M]. +*@li rhs:A Tensor. Must have the same type as matrix. Shape is [..., M, K]. +*@li l2:0-D double Tensor. Ignored if fast=False. + +*@par Attributes: +*fast:bool. Defaults to True. + +*@par Outputs: +*y:Tensor of shape [..., N, K] whose inner-most 2 dimensions form M-by-K \n +matrices that solve the equations matrix[..., :, :] * output[..., :, :] = \n +rhs[..., :, :] in the least squares sense. 
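+
+*@par Example:
+*A host-side sketch of the fast path (normal equations, l2 = 0) for a single \n
+2x2 system. It illustrates the semantics only, not the device kernel, and \n
+the variable names are illustrative. \n
+*@code
+*  // Solve min ||A*x - b|| by solving (A^T*A) * x = A^T*b.
+*  double A[2][2] = {{1.0, 0.0}, {1.0, 1.0}};
+*  double b[2] = {1.0, 2.0};
+*  double ata[2][2], atb[2];
+*  for (int i = 0; i < 2; ++i) {
+*    atb[i] = A[0][i] * b[0] + A[1][i] * b[1];
+*    for (int j = 0; j < 2; ++j)
+*      ata[i][j] = A[0][i] * A[0][j] + A[1][i] * A[1][j];
+*  }
+*  double det = ata[0][0] * ata[1][1] - ata[0][1] * ata[1][0];
+*  double x0 = (ata[1][1] * atb[0] - ata[0][1] * atb[1]) / det;  // x0 = 1
+*  double x1 = (ata[0][0] * atb[1] - ata[1][0] * atb[0]) / det;  // x1 = 1
+*@endcode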
+
+*@attention Constraints: \n
+*The input "matrix" is a tensor of shape [..., M, M] whose inner-most 2 \n
+dimensions form square matrices. \n
+
+*/
+
 REG_OP(MatrixSolveLs)
     .INPUT(matrix, TensorType({DT_FLOAT, DT_DOUBLE}))
     .INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE}))
@@ -65,6 +210,31 @@ REG_OP(MatrixSolveLs)
     .ATTR(fast, Bool, true)
     .OP_END_FACTORY_REG(MatrixSolveLs)
 
+/**
+*@brief Solves systems of linear equations with upper or lower triangular \n
+matrices by backsubstitution.
+
+*@par Inputs:
+*The input rhs must have the same type as matrix. Inputs include: \n
+*@li matrix: A Tensor. Must be one of the following types: double, float. \n
+Shape is [..., M, M].
+*@li rhs:A Tensor. Must have the same type as matrix. Shape is [..., M, K].
+
+*@par Attributes:
+*@li lower: An optional bool. Defaults to True. Boolean indicating whether \n
+the innermost matrices in matrix are lower or upper triangular.
+*@li adjoint: An optional bool. Defaults to False. Boolean indicating whether to solve \n
+with matrix or its (block-wise) adjoint.
+
+*@par Outputs:
+*y:A Tensor. Has the same type as matrix.
+
+*@attention Constraints: \n
+*The input matrix is a tensor of shape [..., M, M] whose inner-most 2 \n
+dimensions form square matrices. \n
+
+*/
+
 REG_OP(MatrixTriangularSolve)
     .INPUT(matrix, TensorType({DT_FLOAT, DT_DOUBLE}))
     .INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE}))
@@ -73,6 +243,29 @@ REG_OP(MatrixTriangularSolve)
     .ATTR(adjoint, Bool, false)
     .OP_END_FACTORY_REG(MatrixTriangularSolve)
 
+/**
+*@brief Computes the QR decompositions of one or more matrices.
+
+*@par Inputs:
+*The input shape of x must be [..., M, N]. Inputs include: \n
+*x:A Tensor whose shape is [..., M, N]. Must be one of the following types: \n
+double, float.
+
+*@par Attributes:
+*full_matrices: An optional bool. Defaults to False. If true, compute \n
+full-sized q and r. If false (the default), compute only the leading P \n
+columns of q, where P = min(M, N).
+
+*@par Outputs:
+*@li q: A Tensor. Has the same type as x.
+*@li r: A Tensor. Has the same type as x.
+
+*@attention Constraints: \n
+*The input matrix x is a tensor of shape [..., M, N] whose inner-most 2 \n
+dimensions form matrices of size [M, N]. \n
+
+*/
+
 REG_OP(Qr)
     .INPUT(x, TensorType({ DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
     .OUTPUT(q, TensorType({ DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
@@ -80,6 +273,28 @@ REG_OP(Qr)
     .ATTR(full_matrices, Bool, false)
     .OP_END_FACTORY_REG(Qr)
 
+/**
+*@brief Computes the eigen decomposition of a batch of self-adjoint matrices.
+
+*@par Inputs:
+*The input shape of x must be [..., N, N]. Inputs include: \n
+*x:Tensor of shape [..., N, N]. Only the lower triangular part of each inner \n
+matrix is referenced.
+
+*@par Attributes:
+*compute_v:bool. Defaults to True.
+
+*@par Outputs:
+*@li eigen_value:Eigenvalues. Shape is [..., N]. Sorted in non-decreasing order.
+*@li eigen_vector:Shape is [..., N, N]. The columns of the inner-most matrices \n
+contain eigenvectors of the corresponding matrices in the tensor.
+
+*@attention Constraints: \n
+*The input x is a tensor of shape [..., N, N] whose inner-most 2 dimensions \n
+form square matrices. \n
+
+*/
+
 REG_OP(SelfAdjointEig)
     .INPUT(x, TensorType({ DT_DOUBLE, DT_FLOAT }))
     .OUTPUT(eigen_value, TensorType({ DT_DOUBLE, DT_FLOAT }))
@@ -87,6 +302,35 @@ REG_OP(SelfAdjointEig)
     .ATTR(compute_v, Bool, true)
     .OP_END_FACTORY_REG(SelfAdjointEig)
 
+/**
+*@brief Computes the singular value decompositions of one or more matrices.
+
+*@par Inputs:
+*The input shape of x must be [..., M, N]. Inputs include: \n
+*x:Tensor of shape [..., M, N]. Let P be the minimum of M and N.
+
+*@par Attributes:
+*@li compute_uv:If True then left and right singular vectors will be computed and \n
+returned in u and v, respectively. Otherwise, only the singular values will \n
+be computed, which can be significantly faster.
+*@li full_matrices:If True, compute full-sized u and v. If False (the \n
+default), compute only the leading P singular vectors.
+
+*@par Outputs:
+*@li sigma:Singular values. Shape is [..., P]. The values are sorted in \n
+reverse order of magnitude, so s[..., 0] is the largest value, s[..., 1] \n
+is the second largest, etc.
+*@li u:Left singular vectors. If full_matrices is False (default) then shape \n
+is [..., M, P]; if full_matrices is True then shape is [..., M, M]. Not \n
+returned if compute_uv is False.
+*@li v:Right singular vectors. If full_matrices is False (default) then shape \n
+is [..., N, P]. If full_matrices is True then shape is [..., N, N]. Not \n
+returned if compute_uv is False.
+
+*@attention Constraints: \n
+*The input x is a tensor of shape [..., M, N] whose inner-most 2 dimensions \n
+form matrices of size [M, N]. \n
+
+*/
+
 REG_OP(Svd)
     .INPUT(x, TensorType({ DT_DOUBLE, DT_FLOAT }))
     .OUTPUT(sigma, TensorType({ DT_DOUBLE, DT_FLOAT }))
@@ -95,6 +339,68 @@ REG_OP(Svd)
     .ATTR(compute_uv, Bool, true)
     .ATTR(full_matrices, Bool, false)
     .OP_END_FACTORY_REG(Svd)
+
+/**
+*@brief Computes the LU decomposition of one or more square matrices.
+
+*@par Inputs:
+*input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form \n
+matrices of size `[M, M]`.
+
+*@par Outputs:
+*@li lu: A tensor of shape `[..., M, M]` whose strictly lower triangular part \n
+denotes the lower triangular factor `L` with unit diagonal, and whose upper \n
+triangular part denotes the upper triangular factor `U`.
+*@li p: Permutation of the rows, encoded as a list of indices in `0..M-1`. \n
+Shape is `[..., M]`.
+
+*/
+
+REG_OP(Lu)
+    .INPUT(input, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .OUTPUT(lu, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .OUTPUT(p, TensorType({DT_INT32, DT_INT64}))
+    .REQUIRED_ATTR(output_idx_type, Type)
+    .OP_END_FACTORY_REG(Lu)
+
+/**
+*@brief Computes the matrix square root of one or more square matrices.
+
+*@par Inputs:
+*input: Shape is `[..., M, M]`.
+
+*@par Outputs:
+*y: Shape is `[..., M, M]`.
+
+*/
+
+REG_OP(MatrixSquareRoot)
+    .INPUT(input, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .OP_END_FACTORY_REG(MatrixSquareRoot)
+
+/**
+*@brief Solves tridiagonal systems of equations.
+
+*@par Inputs:
+*@li diagonals: Tensor of shape `[..., 3, M]` whose innermost 2 dimensions represent the \n
+tridiagonal matrices with three rows being the superdiagonal, diagonals, and \n
+subdiagonals, in order. The last element of the superdiagonal and the first \n
+element of the subdiagonal are ignored.
+*@li rhs: Tensor of shape `[..., M, K]`, representing K right-hand sides for each \n
+left-hand side.
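+
+*@par Example:
+*The semantics match the classical Thomas algorithm; a single-system sketch \n
+(M = 3, K = 1, no partial pivoting) under illustrative names: \n
+*@code
+*  // a = subdiagonal, d = diagonal, c = superdiagonal, r = right-hand side.
+*  const int m = 3;
+*  double a[3] = {0.0, 1.0, 1.0}, d[3] = {4.0, 4.0, 4.0};
+*  double c[3] = {1.0, 1.0, 0.0}, r[3] = {5.0, 6.0, 5.0}, x[3];
+*  double cp[3], rp[3];
+*  cp[0] = c[0] / d[0];
+*  rp[0] = r[0] / d[0];
+*  for (int i = 1; i < m; ++i) {            // forward elimination
+*    double denom = d[i] - a[i] * cp[i - 1];
+*    cp[i] = c[i] / denom;
+*    rp[i] = (r[i] - a[i] * rp[i - 1]) / denom;
+*  }
+*  x[m - 1] = rp[m - 1];
+*  for (int i = m - 2; i >= 0; --i)         // back substitution
+*    x[i] = rp[i] - cp[i] * x[i + 1];       // solution here: x = {1, 1, 1}
+*@endcode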
+ +*@par Outputs: +y: Tensor of shape `[..., M, K]` containing the solutions + +*/ + +REG_OP(TridiagonalSolve) + .INPUT(diagonals, TensorType({DT_FLOAT, DT_DOUBLE})) + .INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) + .ATTR(partial_pivoting, Bool, true) + .OP_END_FACTORY_REG(TridiagonalSolve) + } // namespace ge #endif // GE_OP_LINALG_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/logging_ops.h b/third_party/fwkacllib/inc/ops/logging_ops.h index cf012dba..2564282d 100644 --- a/third_party/fwkacllib/inc/ops/logging_ops.h +++ b/third_party/fwkacllib/inc/ops/logging_ops.h @@ -22,10 +22,37 @@ namespace ge { +/** +*@brief Provides the time since epoch in seconds. + +*@par Outputs: +*y: A Tensor of type float64. The timestamp as a double for seconds since \n +the Unix epoch. + +*@attention Constraints: \n +*The timestamp is computed when the op is executed, not when it is added to \n +the graph. + +*/ + REG_OP(Timestamp) .OUTPUT(y, TensorType({DT_DOUBLE})) .OP_END_FACTORY_REG(Timestamp) +/** +*@brief Asserts that the given condition is true. + +*@par Inputs: +*If input_condition evaluates to false, print the list of tensors in data. \n +Inputs include: \n +*@li input_condition: The condition to evaluate. +*@li input_data: The tensors to print out when condition is false. + +*@par Attributes: +*summarize: Print this many entries of each tensor. + +*/ + REG_OP(Assert) .INPUT(input_condition, TensorType{DT_BOOL}) .DYNAMIC_INPUT(input_data, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, @@ -34,16 +61,34 @@ REG_OP(Assert) .ATTR(summarize, Int, 3) .OP_END_FACTORY_REG(Assert) +/** +*@brief Prints a tensor. + +*@par Inputs: +*x: The tensor to print, it is a dynamic_input. + +*/ REG_OP(Print) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_DOUBLE, DT_STRING})) .OP_END_FACTORY_REG(Print) +/** +*@brief Prints a string scalar. + +*@par Inputs: +*The dtype of input x must be string. Inputs include: \n +*x: The string scalar to print. + +*@par Attributes: +*output_stream: A string specifying the output stream or logging level \n +to print to. + +*/ REG_OP(PrintV2) .INPUT(x, TensorType({DT_STRING})) .ATTR(output_stream, String, "stderr") .OP_END_FACTORY_REG(PrintV2) - } // namespace ge #endif // GE_OP_LOGGING_OPS_H diff --git a/third_party/fwkacllib/inc/ops/lookup_ops.h b/third_party/fwkacllib/inc/ops/lookup_ops.h index 3ca5ae46..390e50c6 100644 --- a/third_party/fwkacllib/inc/ops/lookup_ops.h +++ b/third_party/fwkacllib/inc/ops/lookup_ops.h @@ -21,47 +21,137 @@ namespace ge { +/** +*@brief Replaces the contents of the table with the specified keys and values. + +*@par Inputs: +*The dtype of input handle must be resource. Inputs include: \n +*@li handle: A Tensor of type resource. Handle to the table. +*@li keys: A Tensor. Any shape. Keys to look up. +*@li values: A Tensor. Values to associate with keys. + +*/ + REG_OP(LookupTableImport) .INPUT(handle, TensorType({DT_RESOURCE})) - .INPUT(keys, TensorType({DT_BOOL, DT_DOUBLE, \ - DT_FLOAT, DT_INT32, DT_INT64})) + .INPUT(keys, TensorType({DT_STRING, DT_INT32, DT_INT64})) .INPUT(values, TensorType({DT_BOOL, DT_DOUBLE, \ - DT_FLOAT, DT_INT32, DT_INT64})) + DT_FLOAT, DT_INT32, DT_INT64, DT_STRING})) .OP_END_FACTORY_REG(LookupTableImport) +/** +*@brief Updates the table to associates keys with values. + +*@par Inputs: +*The dtype of input handle must be resource. Inputs include: \n +*@li handle: A Tensor of type resource. 
Handle to the table. +*@li keys: A Tensor. Any shape. Keys to look up. +*@li values: A Tensor. Values to associate with keys. + +*@attention Constraints: \n +*@li The tensor keys must be of the same type as the keys of the table. \n +*@li The tensor values must be of the type of the table values. \n + +*/ + REG_OP(LookupTableInsert) .INPUT(handle, TensorType({DT_RESOURCE})) - .INPUT(keys, TensorType({DT_BOOL, DT_DOUBLE, DT_FLOAT, \ - DT_INT32, DT_INT64})) + .INPUT(keys, TensorType({DT_STRING, DT_INT32, DT_INT64})) .INPUT(values, TensorType({DT_BOOL, DT_DOUBLE, DT_FLOAT, \ - DT_INT32, DT_INT64})) + DT_INT32, DT_INT64, DT_STRING})) .OP_END_FACTORY_REG(LookupTableInsert) +/** +*@brief Outputs all keys and values in the table. + +*@par Inputs: +*The dtype of input handle must be resource. Inputs include: \n +*handle: A Tensor of type resource. Handle to the table. + +*@par Attributes: +*@li Tkeys: A DType. +*@li Tvalues: A DType. + +*@par Outputs: +*@li keys: A Tensor of type Tkeys. +*@li values: A Tensor of type Tvalues. + +*/ + REG_OP(LookupTableExport) .INPUT(handle, TensorType({DT_RESOURCE})) - .OUTPUT(keys, TensorType({DT_BOOL, DT_DOUBLE, DT_FLOAT, \ - DT_INT32, DT_INT64})) + .OUTPUT(keys, TensorType({DT_INT32, DT_INT64, DT_STRING})) .OUTPUT(values, TensorType({DT_BOOL, DT_DOUBLE, DT_FLOAT, \ - DT_INT32,DT_INT64})) + DT_INT32, DT_INT64, DT_STRING})) .REQUIRED_ATTR(Tkeys, Type) .REQUIRED_ATTR(Tvalues, Type) .OP_END_FACTORY_REG(LookupTableExport) + +/** +*@brief Computes the number of elements in the given table. + +*@par Inputs: +*The dtype of input handle must be resource. Inputs include: \n +*handle: A Tensor of type resource. Handle to the table. + +*@par Outputs: +*size: A Tensor of type int64. + +*/ + REG_OP(LookupTableSize) .INPUT(handle, TensorType({DT_RESOURCE})) .OUTPUT(size, TensorType({DT_INT64})) .OP_END_FACTORY_REG(LookupTableSize) +/** +*@brief Looks up keys in a table, outputs the corresponding values. + +*@par Inputs: +*The dtype of input handle must be resource. Inputs include: \n +*@li handle: A Tensor of type resource. Handle to the table. +*@li keys: A Tensor. Any shape. Keys to look up. +*@li default_value: A Tensor. + +*@par Attributes: +*Tout: Specified type of ouput values. + +*@par Outputs: +*values: A Tensor. Has the same type as default_value. + +*/ + REG_OP(LookupTableFind) .INPUT(handle, TensorType({DT_RESOURCE})) - .INPUT(keys, TensorType({DT_DOUBLE, DT_FLOAT, \ - DT_INT32, DT_INT64})) + .INPUT(keys, TensorType({DT_INT32, DT_INT64, DT_STRING})) .INPUT(default_value, TensorType({DT_DOUBLE, DT_FLOAT, \ - DT_INT32, DT_INT64})) + DT_INT32, DT_INT64, DT_STRING, DT_BOOL})) .OUTPUT(values, TensorType({DT_DOUBLE, DT_FLOAT, DT_INT32, \ - DT_INT64})) + DT_INT64, DT_STRING, DT_BOOL})) .REQUIRED_ATTR(Tout, Type) .OP_END_FACTORY_REG(LookupTableFind) +/** +*@brief Creates a non-initialized hash table. + +*@par Attributes: +*@li container: An optional string. Defaults to "". If non-empty, this table \n +is placed in the given container. Otherwise, a default container is used. +*@li shared_name: An optional string. Defaults to "". If non-empty, this \n +table is shared under the given name across multiple sessions. +*@li use_node_name_sharing: An optional bool. Defaults to False. If true and \n +shared_name is empty, the table is shared using the node name. +*@li key_dtype: A DType. Type of the table keys. +*@li value_dtype: A DType. Type of the table values. + +*@par Outputs: +*handle: A Tensor of type resource. Handle to the table. 
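+
+*@par Example:
+*Functionally, the table ops behave like an ordinary in-memory map; a rough \n
+host-side analogy (not the resource-based device implementation): \n
+*@code
+*  #include <unordered_map>
+*  std::unordered_map<int64_t, float> table;               // HashTable
+*  table[1] = 0.5f;                                        // LookupTableImport/Insert
+*  auto it = table.find(2);                                // LookupTableFind
+*  float value = (it != table.end()) ? it->second : -1.0f; // default_value
+*  size_t size = table.size();                             // LookupTableSize
+*@endcode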
+ +*@attention Constraints: \n +*The implementation for HashTable on Ascend uses ai cpu, with bad performance. \n + +*/ + REG_OP(HashTable) .OUTPUT(handle, TensorType({DT_RESOURCE})) .ATTR(container, String, "") @@ -71,18 +161,60 @@ REG_OP(HashTable) .REQUIRED_ATTR(value_dtype, Type) .OP_END_FACTORY_REG(HashTable) +/** +*@brief Table initializer that takes two tensors for keys and values \n +respectively. + +*@par Inputs: +*The dtype of input handle must be resource. Inputs include: \n +*@li handle: A Tensor of type resource. Handle to a table which will be \n +initialized. +*@li keys: A Tensor. Keys of type Tkey. +*@li values: A Tensor. Values of type Tval. + +*/ + REG_OP(InitializeTable) .INPUT(handle, TensorType({DT_RESOURCE})) - .INPUT(keys, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) - .INPUT(values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .INPUT(keys, TensorType({DT_INT32, DT_INT64, DT_STRING})) + .INPUT(values, TensorType({DT_INT32, DT_INT64, DT_FLOAT, \ + DT_DOUBLE, DT_BOOL, DT_STRING})) .OP_END_FACTORY_REG(InitializeTable) +/** +*@brief Creates an empty hash table that uses tensors as the backing store. + +*@par Inputs: +*The input deleted_key must have the same type as empty_key. Inputs include: \n +*@li empty_key: A Tensor. The key used to represent empty key buckets \n +internally. Must not be used in insert or lookup operations. +*@li deleted_key: A Tensor. Must have the same type as empty_key. + +*@par Attributes: +*@li container: An optional string. Defaults to "". If non-empty, this table \n +is placed in the given container. Otherwise, a default container is used. +*@li shared_name: An optional string. Defaults to "". If non-empty, this \n +table is shared under the given name across multiple sessions. +*@li use_node_name_sharing: An optional bool. Defaults to False. If true and \n +shared_name is empty, the table is shared using the node name. +*@li value_dtype: A DType. Type of the table values. +*@li value_shape: An optional TensorShape or list of ints. Defaults to []. \n +The shape of each value. +*@li initial_num_buckets: An optional int. Defaults to 131072. The initial \n +number of hash table buckets. Must be a power to 2. +*@li max_load_factor: An optional float. Defaults to 0.8. The maximum ratio \n +between number of entries and number of buckets before growing the table. \n +Must be between 0 and 1. + +*@par Outputs: +*handle: A Tensor of type resource. Handle to the table. + +*/ + REG_OP(MutableDenseHashTable) - .INPUT(empty_key, TensorType({DT_INT32, DT_INT64})) - .INPUT(deleted_key, TensorType({DT_INT32, DT_INT64})) - .OUTPUT(handle, TensorType({DT_RESOURSE})) + .INPUT(empty_key, TensorType({DT_INT32, DT_INT64, DT_STRING})) + .INPUT(deleted_key, TensorType({DT_INT32, DT_INT64, DT_STRING})) + .OUTPUT(handle, TensorType({DT_RESOURCE})) .ATTR(container, String, "") .ATTR(shared_name, String, "") .ATTR(use_node_name_sharing, Bool, false) @@ -92,8 +224,27 @@ REG_OP(MutableDenseHashTable) .ATTR(max_load_factor, Float, 0.8) .OP_END_FACTORY_REG(MutableDenseHashTable) +/** +*@brief Creates an empty hash table. + +*@par Attributes: +*@li container: An optional string. Defaults to "". If non-empty, this table \n +is placed in the given container. Otherwise, a default container is used. +*@li shared_name: An optional string. Defaults to "". If non-empty, this \n +table is shared under the given name across multiple sessions. 
+*@li use_node_name_sharing: An optional bool. Defaults to False. If true and \n +shared_name is empty, the table is shared using the node name. +*@li key_dtype: A DType. Type of the table keys. +*@li value_dtype: A DType. Type of the table values. +*@li value_shape: An optional TensorShape or list of ints. Defaults to []. + +*@par Outputs: +*handle: A Tensor of type resource. Handle to the table. + +*/ + REG_OP(MutableHashTableOfTensors) - .OUTPUT(handle, TensorType({DT_RESOURSE})) + .OUTPUT(handle, TensorType({DT_RESOURCE})) .ATTR(container, String, "") .ATTR(shared_name, String, "") .ATTR(use_node_name_sharing, Bool, false) @@ -102,8 +253,26 @@ REG_OP(MutableHashTableOfTensors) .ATTR(value_shape, ListInt, {}) .OP_END_FACTORY_REG(MutableHashTableOfTensors) +/** +*@brief Creates an empty hash table. + +*@par Attributes: +*@li container: An optional string. Defaults to "". If non-empty, this table \n +is placed in the given container. Otherwise, a default container is used. +*@li shared_name: An optional string. Defaults to "". If non-empty, this \n +table is shared under the given name across multiple sessions. +*@li use_node_name_sharing: An optional bool. Defaults to False. If true and \n +shared_name is empty, the table is shared using the node name. +*@li key_dtype: A DType. Type of the table keys. +*@li value_dtype: A DType. Type of the table values. + +*@par Outputs: +*handle: A Tensor of type resource. Handle to the table. + +*/ + REG_OP(MutableHashTable) - .OUTPUT(handle, TensorType({DT_RESOURSE})) + .OUTPUT(handle, TensorType({DT_RESOURCE})) .ATTR(container, String, "") .ATTR(shared_name, String, "") .ATTR(use_node_name_sharing, Bool, false) diff --git a/third_party/fwkacllib/inc/ops/math_ops.h b/third_party/fwkacllib/inc/ops/math_ops.h index 358d5341..aa318c94 100644 --- a/third_party/fwkacllib/inc/ops/math_ops.h +++ b/third_party/fwkacllib/inc/ops/math_ops.h @@ -22,18 +22,61 @@ namespace ge { +/** +*@brief Compute the lower regularized incomplete Gamma function P(a, x). + +*@par Inputs: +*The input a and x must have the same type. Inputs include: \n +*@li a:A Tensor. Must be one of the following types: float, double. +*@li x:A Tensor. Must have the same type as a. + +*@par Outputs: +*z:A Tensor. Has the same type as a. + +*/ + REG_OP(Igamma) .INPUT(a, TensorType({DT_FLOAT, DT_DOUBLE})) .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE})) .OUTPUT(z, TensorType({DT_FLOAT, DT_DOUBLE})) .OP_END_FACTORY_REG(Igamma) +/** +*@brief Compute the upper regularized incomplete Gamma function Q(a, x). + +*@par Inputs: +*The input a and x must have the same type. Inputs include: \n +*@li a:A Tensor. Must be one of the following types: float, float64. +*@li x:A Tensor. Must have the same type as a. + +*@par Outputs: +*z:A Tensor. Has the same type as a. + +*/ + REG_OP(Igammac) .INPUT(a, TensorType({DT_FLOAT, DT_DOUBLE})) .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE})) .OUTPUT(z, TensorType({DT_FLOAT, DT_DOUBLE})) .OP_END_FACTORY_REG(Igammac) +/** +*@brief Compare values of input to threshold and pack resulting bits into \n +a uint8. + +*@par Inputs: +*The input size must be a non-negative int32 scalar Tensor. Inputs include: \n +*@li input:Values to compare against threshold and bitpack. +*@li threshold:Threshold to compare against. + +*@par Outputs: +*y:The bitpacked comparisons. + +*@attention Constraints: \n +*Currently, the innermost dimension of the tensor must be divisible by 8. 
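+
+*@par Example:
+*A scalar sketch of the packing semantics for one block of eight inputs; the \n
+bit order shown (first element into the highest bit) is one plausible layout \n
+and should be checked against the kernel: \n
+*@code
+*  uint8_t PackBlock(const float* x, float threshold) {
+*    uint8_t out = 0;
+*    for (int i = 0; i < 8; ++i)
+*      out |= static_cast<uint8_t>(x[i] > threshold) << (7 - i);
+*    return out;
+*  }
+*@endcode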
\n + +*/ + REG_OP(CompareAndBitpack) .INPUT(x, TensorType({ DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_INT8, \ DT_INT16, DT_INT32, DT_INT64, DT_BOOL })) @@ -42,6 +85,27 @@ REG_OP(CompareAndBitpack) .OUTPUT(y, TensorType(DT_UINT8)) .OP_END_FACTORY_REG(CompareAndBitpack) +/** +*@brief Counts the number of occurrences of each value in an integer array. \n +Outputs a vector with length size and the same dtype as weights. If weights \n +are empty, then index i stores the number of times the value i is counted in \n +arr. If weights are non-empty, then index i stores the sum of the value in \n +weights at each index. + +*@par Inputs: +*The input size must be a non-negative int32 scalar Tensor. Inputs include: \n +*@li array:int32 Tensor. +*@li size:non-negative int32 scalar Tensor. +*@li weights: is an int32, int64, float32, or double Tensor with the same \n +shape as arr, or a length-0 Tensor, in which case it acts as all weights \n +equal to 1. + +*@par Outputs: +*bins:1D Tensor with length equal to size. The counts or summed weights for \n +each value in the range [0, size). + +*/ + REG_OP(Bincount) .INPUT(array, TensorType(DT_INT32)) .INPUT(size, TensorType(DT_INT32)) @@ -49,6 +113,20 @@ REG_OP(Bincount) .OUTPUT(bins, TensorType({ DT_FLOAT, DT_INT32, DT_INT64, DT_DOUBLE })) .OP_END_FACTORY_REG(Bincount) +/** +*@brief Compute the regularized incomplete beta integral. + +*@par Inputs: +*The input b and x must have the same types as a. Inputs include: \n +*@li a:A Tensor. Must be one of the following types: float32, double. +*@li b:A Tensor. Must have the same type as a. +*@li x:A Tensor. Must have the same type as a. + +*@par Outputs: +*z:A Tensor. Has the same type as a. + +*/ + REG_OP(Betainc) .INPUT(a, TensorType({DT_DOUBLE, DT_FLOAT})) .INPUT(b, TensorType({DT_DOUBLE, DT_FLOAT})) @@ -56,18 +134,68 @@ REG_OP(Betainc) .OUTPUT(z, TensorType({DT_DOUBLE, DT_FLOAT})) .OP_END_FACTORY_REG(Betainc) +/** +*@brief Compute the Hurwitz zeta function + +*@par Inputs: +*The input q must be the same type as x. Inputs include: \n +*@li x:A Tensor. Must be one of the following types: float32, double. +*@li q:A Tensor. Must have the same type as x. + +*@par Outputs: +*z:A Tensor. Has the same type as x. + +*@attention Constraints: \n +*The implementation for Zeta on Ascend uses ai cpu, with bad performance. \n + +*/ + REG_OP(Zeta) .INPUT(x, TensorType({DT_DOUBLE, DT_FLOAT})) .INPUT(q, TensorType({DT_DOUBLE, DT_FLOAT})) .OUTPUT(z, TensorType({DT_DOUBLE, DT_FLOAT})) .OP_END_FACTORY_REG(Zeta) +/** +*@brief Bucketizes 'input' based on 'boundaries'. For example, if the inputs \n +are boundaries = [0, 10, 100] input = [[-5, 10000] [150, 10] [5, 100]] then \n +the output will be output = [[0, 3] [3, 2] [1, 3]] + +*@par Inputs: +*The dtype of input x must be int or float. Inputs include: \n +*x:Any shape of Tensor contains with int or float type. + +*@par Attributes: +*boundaries:A sorted list of floats gives the boundary of the buckets. + +*@par Outputs: +*y:Same shape with 'input', each value of input replaced with bucket index. + +*/ + REG_OP(Bucketize) .INPUT(x, TensorType({DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT})) .OUTPUT(y, TensorType({DT_INT32})) .REQUIRED_ATTR(boundaries, ListFloat) .OP_END_FACTORY_REG(Bucketize) +/** +*@brief Computes the sum along sparse segments of a tensor. + +*@par Inputs: +*The input indices and segment_ids must have same rank. Inputs include: \n +*@li x:A Tensor. Must be one of the following types: float, double, int32, \n +uint8, int16, int8, int64, uint16, uint32, uint64. 
+*@li indices: A Tensor. Must be one of the following types: int32, int64. \n +A 1-D tensor. Has same rank as segment_ids. +*@li segment_ids: A Tensor of type int32. A 1-D tensor. Values should be \n +sorted and can be repeated. + +*@par Outputs: +*y:A Tensor. Has the same type as x. + +*/ + REG_OP(SparseSegmentSum) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16})) @@ -77,6 +205,22 @@ REG_OP(SparseSegmentSum) DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16})) .OP_END_FACTORY_REG(SparseSegmentSum) +/** +*@brief Computes the mean along sparse segments of a tensor. + +*@par Inputs: +*The input indices and segment_ids must have same rank. Inputs include: \n +*@li x: A Tensor. Must be one of the following types: float, double. +*@li indices: A Tensor. Must be one of the following types: int32, int64. \n +A 1-D tensor. Has same rank as segment_ids. +*@li segment_ids: A Tensor of type int32. A 1-D tensor. Values should be \n +sorted and can be repeated. + +*@par Outputs: +*y:A Tensor. Has the same type as x. + +*/ + REG_OP(SparseSegmentMean) .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE})) .INPUT(indices, TensorType({DT_INT32})) @@ -84,6 +228,25 @@ REG_OP(SparseSegmentMean) .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) .OP_END_FACTORY_REG(SparseSegmentMean) +/** +*@brief Computes gradients for SparseSegmentMean. + +*@par Inputs: +*The input grad must have be type float or double. Inputs include: \n +*@li grad: A Tensor. Must be one of the following types: float, double. \n +gradient propagated to the SparseSegmentMean op. +*@li indices: A Tensor. Must be one of the following types: int32, int64. \n +indices passed to the corresponding SparseSegmentMean op. +*@li segment_ids: A Tensor of type int32. segment_ids passed to the \n +corresponding SparseSegmentMean op. +*@li output_dim0: A Tensor of type int32. dimension 0 of "x" passed to \n +SparseSegmentMean op. + +*@par Outputs: +*y:A Tensor. Has the same type as grad. + +*/ + REG_OP(SparseSegmentMeanGrad) .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE})) .INPUT(indices, TensorType({DT_INT32})) @@ -92,16 +255,52 @@ REG_OP(SparseSegmentMeanGrad) .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) .OP_END_FACTORY_REG(SparseSegmentMeanGrad) +/** +*@brief Computes the gradient of igamma(a, x) wrt a + +*@par Inputs: +*The input a and x must have the same type. Inputs include: \n +*@li a:A Tensor. Must be one of the following types: float32, double. +*@li x:A Tensor. Must have the same type as a. + +*@par Outputs: +*y:A Tensor. Has the same type as a. + +*/ + REG_OP(IgammaGradA) .INPUT(a, TensorType({DT_FLOAT, DT_DOUBLE})) .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE})) .OUTPUT(z, TensorType({DT_FLOAT, DT_DOUBLE})) .OP_END_FACTORY_REG(IgammaGradA) +/** +*@brief Initialize data process channel. + +*@par Attributes: +*channel_name: A string. Default "". + +*/ + REG_OP(InitData) .ATTR(channel_name, String, "") .OP_END_FACTORY_REG(InitData) +/** +*@brief Get the next batch of data in data processing. + +*@par Attributes: +*@li output_types: A nested structure of DType objects corresponding to each \n +component of an element of this dataset. +*@li output_shapes: A nested structure of TensorShape objects corresponding \n +to each component of an element of this dataset. +*@li channel_name: A string. Default "". + +*@par Outputs: +*y:A nested structure of Tensor objects. 
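+
+*@par Example:
+*A construction sketch, assuming the set_attr_/create_dynamic_output \n
+accessors generated by REG_OP; the attribute values are illustrative only: \n
+*@code
+*  ge::op::GetNext get_next("get_next");
+*  get_next.set_attr_output_types({ge::DT_FLOAT, ge::DT_INT32});
+*  get_next.set_attr_output_shapes({{32, 224, 224, 3}, {32}});
+*  get_next.set_attr_channel_name("queue_0");
+*  get_next.create_dynamic_output_y(2);
+*@endcode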
+
+*/
+
+REG_OP(GetNext)
+    .DYNAMIC_OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
+        DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_FLOAT16, DT_FLOAT,
+        DT_DOUBLE, DT_BOOL}))
@@ -118,6 +317,7 @@ REG_OP(GetNext)
 
 *@par Outputs:
 *y: A Tensor. Has the same type as "x".
+
 */
 REG_OP(Erf)
     .INPUT(x, TensorType::FloatingDataType())
@@ -132,6 +332,7 @@
 
 *@par Outputs:
 *y: A Tensor. Has the same type as "x".
+
 */
 REG_OP(Erfc)
     .INPUT(x, TensorType::FloatingDataType())
@@ -154,6 +355,7 @@
 
 *@par Outputs:
 *y: A Tensor. A Tensor of type int32.
+
 */
 REG_OP(HistogramFixedWidth)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
@@ -179,6 +381,7 @@
 
 *@par Outputs:
 *y: A Tensor. A Tensor of type int32.
+
 */
 REG_OP(HistogramFixedWidthD)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
@@ -187,6 +390,41 @@
     .REQUIRED_ATTR(nbins, Int)
     .ATTR(dtype, String, "int32")
     .OP_END_FACTORY_REG(HistogramFixedWidthD)
+
+/**
+*@brief Returns the next representable value of x1 in the direction of x2, element-wise.
+
+*@par Inputs:
+*The input x1 and x2 must have the same type. Inputs include: \n
+*@li x1:A Tensor. Must be one of the following types: float32, double.
+*@li x2:A Tensor. Must have the same type as x1.
+
+*@par Outputs:
+*output:A Tensor. Has the same type as x1.
+
+*/
+REG_OP(NextAfter)
+    .INPUT(x1, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .INPUT(x2, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .OUTPUT(output, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .OP_END_FACTORY_REG(NextAfter)
+
+/**
+*@brief Compute element-wise finiteness, returning a boolean tensor.
+
+*@par Inputs:
+*x:A Tensor.
+
+*@par Outputs:
+*y:A Tensor. Has the same shape as x.
+
+*/
+REG_OP(IsFinite)
+    .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64,
+        DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64,
+        DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_BOOL}))
+    .OUTPUT(y, TensorType({DT_BOOL}))
+    .OP_END_FACTORY_REG(IsFinite)
 } // namespace ge
 
 #endif  // GE_OP_MATH_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h
index f5045786..597a8982 100644
--- a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h
+++ b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h
@@ -46,18 +46,40 @@ REG_OP(MatMul)
     .INPUT(x2, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
     .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
     .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
-    .ATTR(transpose_a, Bool, false)
-    .ATTR(transpose_b, Bool, false)
+    .ATTR(transpose_x1, Bool, false)
+    .ATTR(transpose_x2, Bool, false)
     .OP_END_FACTORY_REG(MatMul)
 
-REG_OP(MatMulV2)
-    .INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT16, DT_INT8, DT_INT8}))
-    .INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT16, DT_INT8, DT_INT8}))
-    .INPUT(alpha, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_FLOAT}))
-    .INPUT(beta, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_FLOAT}))
-    .INPUT(bias, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_FLOAT}))
-    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_FLOAT}))
-    .OP_END_FACTORY_REG(MatMulV2)
+/**
+*@brief Performs matrix-to-matrix multiplication, producing c = alpha[0]*a*b + beta[0]*c.
+
+*@par Inputs:
+*Five inputs, including:
+*@li a: A matrix Tensor. 4D. Must be one of the following types:\n float16, int8. Has format [FRACTAL_NZ].
+*@li b: A matrix Tensor. 4D. Must be one of the following types:\n float16, int8.
When type is int8, has format [FRACTAL_Z], \n otherwise has format [FRACTAL_NZ]. +*@li c: A matrix Tensor. 2D or higher. Must be one of the following types: \n float16, int32, float32. When type is int32, has format [ND], \n otherwise has format [FRACTAL_NZ]. +*@li alpha: A 1D Tensor. The shape of alpha is [1].\n Must be one of the following types: float16, int32, float32. Has format [ND]. +*@li beta: A 1D Tensor. The shape of beta is [1].\n Must be one of the following types: float16, int32, float32. Has format [ND]. + +*@par Attributes: +*Two attributes, including: +*@li transpose_a: Optional. A bool.\n If True, changes the shape of "a" from [M, K] to [K, M].\n Reserved parameters, not used for now. +*@li transpose_b: Optional. A bool.\n If True, changes the shape of "b" from [M, K] to [K, M].\n Reserved parameters, not used for now. + +*@par Outputs: +*@out: The result matrix Tensor. 4D. Must be one of the following types:\n float16, float32, int32. Has format [FRACTAL_NZ]. +*/ + +REG_OP(Gemm) + .INPUT(a, TensorType({DT_FLOAT16, DT_INT8})) + .INPUT(b, TensorType({DT_FLOAT16, DT_INT8})) + .INPUT(c, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) + .INPUT(alpha, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) + .INPUT(beta, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) + .OUTPUT(out, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) + .ATTR(transpose_a, Bool, false) + .ATTR(transpose_b, Bool, false) + .OP_END_FACTORY_REG(Gemm) /** *@brief Multiplies matrix "a" by matrix "b", producing "a * b". @@ -77,12 +99,13 @@ REG_OP(MatMulV2) *y: The result matrix Tensor. 2D or higher. Must be one of the following types: float16, * float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ]. Has the same shape length as "x1" and "x2". */ + REG_OP(BatchMatMul) .INPUT(x1, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) .INPUT(x2, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) - .ATTR(adj_x, Bool, false) - .ATTR(adj_y, Bool, false) + .ATTR(adj_x1, Bool, false) + .ATTR(adj_x2, Bool, false) .OP_END_FACTORY_REG(BatchMatMul) REG_OP(MeanCCE) @@ -145,34 +168,103 @@ REG_OP(L2Loss) .OUTPUT(y, TensorType::FloatingDataType()) .OP_END_FACTORY_REG(L2Loss) +/** +*@brief: Returns a batched diagonal tensor with a given batched diagonal values. + +*@par Inputs: +*x: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8. + +*@par Outputs: +*y: A Tensor. Has the same type as "x". + +*/ REG_OP(MatrixDiag) .INPUT(x, TensorType::BasicType()) .OUTPUT(y, TensorType::BasicType()) .OP_END_FACTORY_REG(MatrixDiag) +/** +*@brief: Returns a batched diagonal tensor with a given batched diagonal values. + +*@par Inputs: +* Two inputs, including: +*@li x: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8. +*@li assist: A Tensor of the same type as "x". + +*@par Outputs: +*y: A Tensor. Has the same type as "x". + +*/ REG_OP(MatrixDiagD) .INPUT(x, TensorType::BasicType()) .INPUT(assist, TensorType::BasicType()) .OUTPUT(y, TensorType::BasicType()) .OP_END_FACTORY_REG(MatrixDiagD) +/** +*@brief: Returns the batched diagonal part of a batched tensor. + +*@par Inputs: +*x: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8. + +*@par Outputs: +*y: A Tensor. Has the same type as "x". + +*/ REG_OP(MatrixDiagPart) .INPUT(x, TensorType::BasicType()) .OUTPUT(y, TensorType::BasicType()) .OP_END_FACTORY_REG(MatrixDiagPart) +/** +*@brief: Returns the batched diagonal part of a batched tensor. 
+ +*@par Inputs: +* Two inputs, including: +*@li x: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8. +*@li assist: A Tensor of the same type as "x". + +*@par Outputs: +*y: A Tensor. Has the same type as "x". + +*/ REG_OP(MatrixDiagPartD) .INPUT(x, TensorType::BasicType()) .INPUT(assist, TensorType::BasicType()) .OUTPUT(y, TensorType::BasicType()) .OP_END_FACTORY_REG(MatrixDiagPartD) +/** +*@brief: Returns a batched matrix tensor with new batched diagonal values. + +*@par Inputs: +* Two inputs, including: +*@li x: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8. +*@li diagonal: A Tensor of the same type as "x". + +*@par Outputs: +*y: A Tensor. Has the same type as "x". + +*/ REG_OP(MatrixSetDiag) .INPUT(x, TensorType::BasicType()) .INPUT(diagonal, TensorType::BasicType()) .OUTPUT(y, TensorType::BasicType()) .OP_END_FACTORY_REG(MatrixSetDiag) +/** +*@brief: Returns a batched matrix tensor with new batched diagonal values. + +*@par Inputs: +* Three inputs, including: +*@li x: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8. +*@li diagonal: A Tensor of the same type as "x". +*@li assist: A Tensor of the same type as "x". + +*@par Outputs: +*y: A Tensor. Has the same type as "x". + +*/ REG_OP(MatrixSetDiagD) .INPUT(x, TensorType::BasicType()) .INPUT(diagonal, TensorType::BasicType()) @@ -180,6 +272,28 @@ REG_OP(MatrixSetDiagD) .OUTPUT(y, TensorType::BasicType()) .OP_END_FACTORY_REG(MatrixSetDiagD) +/** +*@brief Applies sparse "updates" to individual values or slices in a Variable. + +*@par Inputs: +* Three inputs, including: +*@li var: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int8, uint8, bool +*@li indices: An ND Tensor. \n + +*Must be one of the following types: int32 +*@li updates: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int8, uint8, bool + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterNdUpdate) .INPUT(var, TensorType::BasicType()) .INPUT(indices, TensorType::IndexNumberType()) @@ -188,6 +302,28 @@ REG_OP(ScatterNdUpdate) .ATTR(use_locking, Bool, false) .OP_END_FACTORY_REG(ScatterNdUpdate) +/** +*@brief Adds sparse "updates" to a variable reference. + +*@par Inputs: +* Three inputs, including: +*@li var: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 +*@li indices: An ND Tensor of type int32. + + +*@li updates: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterAdd) .INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8})) .INPUT(indices, TensorType::IndexNumberType()) @@ -196,6 +332,29 @@ REG_OP(ScatterAdd) .ATTR(use_locking, Bool, false) .OP_END_FACTORY_REG(ScatterAdd) +/** +*@brief Divides a variable reference by sparse updates. + +*@par Inputs: +* Three inputs, including: +*@li var: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 +*@li indices: An NCHW, NHWC, or ND Tensor. 
\n + +*Must be one of the following types: int32 +*@li updates: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 + +*@par Attributes: +*@li use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. +*@li isRef: An optional bool. Defaults to "True" + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterDiv) .INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8})) .INPUT(indices, TensorType({DT_INT32})) @@ -204,6 +363,28 @@ REG_OP(ScatterDiv) .ATTR(use_locking, Bool, false) .OP_END_FACTORY_REG(ScatterDiv) +/** +*@brief Applies sparse addition to individual values or slices in a Variable. + +*@par Inputs: +* Three inputs, including: +*@li var: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 +*@li indices: An ND Tensor. \n + +*Must be one of the following types: int32 +*@li updates: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterNdAdd) .INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8})) .INPUT(indices, TensorType::IndexNumberType()) @@ -212,6 +393,28 @@ REG_OP(ScatterNdAdd) .ATTR(use_locking, Bool, false) .OP_END_FACTORY_REG(ScatterNdAdd) +/** +*@brief Applies sparse subtraction to individual values or slices in a Variable. + +*@par Inputs: +* Three inputs, including: +*@li var: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 +*@li indices: An ND Tensor. \n + +*Must be one of the following types: int32 +*@li updates: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterNdSub) .INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8})) .INPUT(indices, TensorType::IndexNumberType()) @@ -220,6 +423,28 @@ REG_OP(ScatterNdSub) .ATTR(use_locking, Bool, false) .OP_END_FACTORY_REG(ScatterNdSub) +/** +*@brief Subtracts sparse updates to a variable reference. + +*@par Inputs: +* Three inputs, including: +*@li var: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 +*@li indices: An ND Tensor. \n + +*Must be one of the following types: int32 +*@li updates: An ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterSub) .INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8})) .INPUT(indices, TensorType::IndexNumberType()) @@ -228,12 +453,34 @@ REG_OP(ScatterSub) .ATTR(use_locking, Bool, false) .OP_END_FACTORY_REG(ScatterSub) +/** +*@brief: Returns the batched diagonal part of a batched tensor with "assist". + +*@par Inputs: +* Two inputs, including: +* @li x: A Tensor of type float16, float32, or int32. +* @li assist: A Tensor of the same type as "x". 
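+
+*@par Example:
+*For the rank-2 case the semantics reduce to reading the main diagonal; a \n
+minimal sketch with illustrative names: \n
+*@code
+*  // For a d x d input stored row-major, y[i] = x[i][i].
+*  void DiagPart2D(const float* x, float* y, int d) {
+*    for (int i = 0; i < d; ++i)
+*      y[i] = x[i * d + i];
+*  }
+*@endcode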
+
+*@par Outputs:
+*y: A Tensor. Has the same type as "x".
+
+*/
 REG_OP(DiagPartD)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
     .INPUT(assist, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
     .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
     .OP_END_FACTORY_REG(DiagPartD)
 
+/**
+*@brief: Returns the diagonal part of a tensor.
+
+*@par Inputs:\n
+*x: A Tensor. Must be one of the following types: float16, float32, int32, int64, double, complex64, complex128.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as "x".
+
+*/
 REG_OP(DiagPart)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT64, DT_DOUBLE,
                           DT_COMPLEX64, DT_COMPLEX128}))
@@ -241,6 +488,26 @@
                           DT_COMPLEX64, DT_COMPLEX128}))
     .OP_END_FACTORY_REG(DiagPart)
 
+/**
+*@brief Computes an inner product with a set of learned weights and (optionally) adds biases; also known as a "fully-connected" layer.
+
+*@par Inputs:
+* Four inputs, including:
+*@li x: A Tensor of type float16, int8.
+*@li w: A weight matrix of type float16, int8.
+*@li b: A Tensor of type float16, int32.
+*@li offset_w: A Tensor of type int8.
+
+*@par Attributes:
+*@li num_output: Reserved.
+*@li transpose: A bool, specifying whether to transpose, either "true" or "false". Defaults to "false".
+*@li bias_term: A bool, specifying whether to learn and apply a set of additive biases to the filter outputs, either "true" or "false". Defaults to "true".
+*@li axis: Only the value 1 is supported. Defaults to "1".
+*@li offset_a: An optional int. Defaults to "0".
+
+*@par Outputs:
+*y: The result tensor of type float16, int8.
+*/
 REG_OP(InnerProduct)
     .INPUT(x, TensorType({DT_FLOAT16, DT_INT8}))
     .INPUT(w, TensorType({DT_FLOAT16, DT_INT8}))
@@ -254,6 +521,29 @@
     .ATTR(offset_a, Int, 0)
     .OP_END_FACTORY_REG(InnerProduct)
 
+/**
+*@brief Computes the confusion matrix from predictions and labels.
+
+*@par Inputs:
+* Three inputs, including:
+*@li labels: A Tensor. Must be one of the following types: float16, float32, int32, int8.
+*@li predictions: A Tensor. Must be one of the following types: float16, float32, int32, int8.
+*@li weights: A Tensor. Must be one of the following types: float16, float32, int32, int8.
+
+*@par Attributes:
+*@li num_classes: An integer for the shape of the output matrix. No default value.
+*@li dtype: Data type of the confusion matrix. No default value.
+
+*@par Outputs:
+*y: A Tensor. Has the same type and format as input "labels".
+
+*@attention Constraints:
+*@li "weights", "labels", and "predictions" are 1D tensors.
+*@li The output has shape (num_classes, num_classes), where 1 <= num_classes <= 4096.
+
+*@see Region()
+
+*/
 REG_OP(ConfusionMatrix)
     .INPUT(labels, TensorType({DT_FLOAT, DT_INT32, DT_FLOAT16, DT_INT8, DT_UINT8}))
     .INPUT(predictions, TensorType({DT_FLOAT, DT_INT32, DT_FLOAT16, DT_INT8, DT_UINT8}))
@@ -263,6 +553,28 @@
     .REQUIRED_ATTR(dtype, String)
     .OP_END_FACTORY_REG(ConfusionMatrix)
 
+/**
+*@brief Multiplies sparse updates into a variable reference.
+
+*@par Inputs:
+* Three inputs, including:
+*@li var: An NCHW, NHWC, or ND Tensor. \n
+
+*Must be one of the following types: float16, float32, int32, int8, uint8
+*@li indices: An NCHW, NHWC, or ND Tensor. \n
+
+*Must be one of the following types: int32
+*@li updates: An NCHW, NHWC, or ND Tensor. \n
+
+*Must be one of the following types: float16, float32, int32, int8, uint8
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".
If "True", the operation will be protected by a lock. + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterMul) .INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8})) .INPUT(indices, TensorType({DT_INT32})) @@ -271,6 +583,28 @@ REG_OP(ScatterMul) .ATTR(use_locking, Bool, false) .OP_END_FACTORY_REG(ScatterMul) +/** +*@brief Reduces sparse updates into a variable reference using the "min" operation. + +*@par Inputs: +* Three inputs, including: +*@li var: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: float16, float32, int32 +*@li indices: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: int32 +*@li updates: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: float16, float32, int32 + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterMin) .INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32})) .INPUT(indices, TensorType({DT_INT32})) @@ -279,6 +613,28 @@ REG_OP(ScatterMin) .ATTR(use_locking, Bool, false) .OP_END_FACTORY_REG(ScatterMin) +/** +*@brief Reduces sparse updates into a variable reference using the "max" operation. + +*@par Inputs: +* Three inputs, including: +*@li var: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: float16, float32, int32 +*@li indices: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: int32 +*@li updates: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: float16, float32, int32 + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterMax) .INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32})) .INPUT(indices, TensorType({DT_INT32})) @@ -287,6 +643,28 @@ REG_OP(ScatterMax) .ATTR(use_locking, Bool, false) .OP_END_FACTORY_REG(ScatterMax) +/** +*@brief Applies sparse updates to a variable reference. + +*@par Inputs: +* Three inputs, including: +*@li var: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 +*@li indices: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: int32 +*@li updates: An NCHW, NHWC, or ND Tensor. \n + +*Must be one of the following types: float16, float32, int32, int8, uint8 + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ REG_OP(ScatterUpdate) .INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8})) .INPUT(indices, TensorType({DT_INT32})) diff --git a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h index fca85035..4b5c5f23 100644 --- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h @@ -107,6 +107,21 @@ REG_OP(FusedBatchNormGrad) .ATTR(momentum, Float, 0.0) .OP_END_FACTORY_REG(FusedBatchNormGrad) +/** +*@brief Normalizes elements of a specific dimension of eigenvalues (L2). + +*@par Inputs: +*One input: \n +*x: A multi-dimensional Tensor of type float16 or float32, specifying the eigenvalue. 
+ +*@par Attributes: +*@li axis: A required attribute of type list, specifying the axis for normalization. +*@li eps: An optional attribute of type float, specifying the lower limit of normalization. Defaults to "1e-4". + +*@par Outputs: \n +*y: A multi-dimensional Tensor of type float16 or float32, specifying the eigenvalue for normalization. + +*/ REG_OP(L2Normalize) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) @@ -114,6 +129,23 @@ REG_OP(L2Normalize) .ATTR(eps, Float, 1e-4) .OP_END_FACTORY_REG(L2Normalize) +/** +*@brief Performs the backpropagation of L2Normalize for training scenarios. + +*@par Inputs: +* Three inputs, including: \n +*@li x: A multi-dimensional Tensor of type float16 or float32, specifying the eigenvalue of forward inputs. +*@li y: A multi-dimensional Tensor of type float16 or float32, specifying the normalization result of the forward output. +*@li dy: A multi-dimensional Tensor of type float16 or float32, specifying the reverse input gradient. + +*@par Attributes: +*@li axis: A required attribute of type int, specifying the axis to be normalized. +*@li eps: An optional attribute of type float, specifying the lower limit of normalization. Defaults to "1e-4". + +*@par Outputs: +*dx: Reverse gradient of eigenvalue "x". Has the same dimensions as "x". + +*/ REG_OP(L2NormalizeGrad) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16})) .INPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) @@ -123,6 +155,35 @@ REG_OP(L2NormalizeGrad) .ATTR(eps, Float, 0.0001) .OP_END_FACTORY_REG(L2NormalizeGrad) +/** +*@brief Performs batch normalization. + +*@par Inputs:\n +* Five inputs, including: (NHWC, NCHW, or NC1HWC0 supported) +*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D. +*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the scaling factor. +*@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the offset. +*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the mean used for inference. Must be "None" if the operation is used for training. +*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the variance used for inference. Must be "None" if the operation is used for training. + +*@par Attributes: +*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.0001". +*@li data_format: An optional string, specifying the format of "x". Defaults to "NHWC". +*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True". + +*@par Outputs:\n +* Five outputs, including: (NHWC, NCHW, or NC1HWC0 supported) +*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", with format NHWC or NCHW for 4D or NC1HWC0 for 5D. +*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x". +*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x". 
+*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output. +*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output. + +*@attention Constraints: +*@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available, then "reserve_space_1" has the same value as "mean" and "reserve_space_2" has the same value as "variance". +*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction. + +*/ REG_OP(BatchNorm) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) .INPUT(scale, TensorType({DT_FLOAT})) @@ -134,12 +195,40 @@ REG_OP(BatchNorm) .OUTPUT(batch_variance, TensorType({DT_FLOAT})) .OUTPUT(reserve_space_1, TensorType({DT_FLOAT})) .OUTPUT(reserve_space_2, TensorType({DT_FLOAT})) - .OUTPUT(reserve_space_3, TensorType({DT_FLOAT})) .ATTR(epsilon, Float, 0.0001) .ATTR(data_format, String, "NHWC") .ATTR(is_training, Bool, true) .OP_END_FACTORY_REG(BatchNorm) +/** +*@brief Performs batch normalization. + +*@par Inputs:\n +* Five inputs, including: (NHWC or NCHW supported) +*@li x: A 4D Tensor of type float16 or float32. +*@li scale: A 1D Tensor of type float32, for the scaling factor. +*@li offset: A 1D Tensor of type float32, for the scaling offset. +*@li mean: A 1D Tensor of type float32, for the mean used for inference. \n Must be "None" if the operation is used for training. +*@li variance: A 1D Tensor of type float32, for the variance used for inference. \n Must be "None" if the operation is used for training. + +*@par Attributes: +*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.0001". +*@li data_format: An optional string, specifying the format of "x". Defaults to "NHWC". +*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True". + +*@par Outputs:\n +* Five outputs, including: (NHWC or NCHW supported) +*@li y: A 4D Tensor of type float16 or float32, for the normalized "x". +*@li batch_mean: A 1D Tensor of type float32, for the mean of "x". +*@li batch_variance: A 1D Tensor of type float32, for the variance of "x". +*@li reserve_space_1: A 1D Tensor of type float32, for the mean of "x" for gradient computation. +*@li reserve_space_2: A 1D Tensor of type float32, for the variance of "x" for gradient computation. + +*@attention Constraints: +*@li If the operation is used for inference, then output "reserve_space_1" has the same value as "mean" and output "reserve_space_2" has the same value as "variance". +*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction. + +*/ REG_OP(BatchNormExt2) .INPUT(input_x, TensorType({DT_FLOAT16,DT_FLOAT})) .INPUT(input_scale, TensorType({DT_FLOAT})) @@ -156,13 +245,40 @@ REG_OP(BatchNormExt2) .ATTR(is_training, Bool, true) .OP_END_FACTORY_REG(BatchNormExt2) +/** +*@brief Performs the backpropagation of BatchNorm. + +*@par Inputs: +* Five inputs, including: \n +*@li y_backprop: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0, for the gradient. 
+*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0.
+*@li scale: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0.
+*@li reserve_space_1: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm.
+*@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm.
+
+*@par Attributes:
+*@li epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x".
+*@li data_format: An optional string. Defaults to "NHWC".
+*@li is_training: An optional bool. Defaults to "true". Specifies whether the operation is used for training (default) or inference.
+
+*@par Outputs:
+*@li x_backprop: A Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0, for the gradient of "x".
+*@li scale_backprop: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0, for the gradient of "scale".
+*@li offset_backprop: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0, for the gradient of "offset".
+*@li reserve_space_4: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. Pass "None" to skip this output.
+*@li reserve_space_5: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. Pass "None" to skip this output.
+
+*@attention Constraints:
+* The preceding layer of this operator must be operator BatchNorm.
+
+*@see BatchNorm
+*/
 REG_OP(BatchNormGrad)
     .INPUT(y_backprop, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(scale, TensorType({DT_FLOAT}))
     .INPUT(reserve_space_1, TensorType({DT_FLOAT}))
     .INPUT(reserve_space_2, TensorType({DT_FLOAT}))
-    .INPUT(reserve_space_3, TensorType({DT_FLOAT}))
     .OUTPUT(x_backprop, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OUTPUT(scale_backprop, TensorType({DT_FLOAT}))
     .OUTPUT(offset_backprop, TensorType({DT_FLOAT}))
@@ -173,6 +289,34 @@ REG_OP(BatchNormGrad)
     .ATTR(is_training, Bool, true)
     .OP_END_FACTORY_REG(BatchNormGrad)
 
+/**
+*@brief Performs the backpropagation of BatchNorm.
+
+*@par Inputs:
+* Five inputs, including: \n
+*@li y_backprop: A 4D Tensor of type float16 or float32, with format NHWC or NCHW, for the gradient.
+*@li x: A 4D Tensor of type float16 or float32, with format NHWC or NCHW.
+*@li scale: A 4D Tensor of type float32, with format NHWC or NCHW.
+*@li reserve_space_1: A 4D Tensor of type float32, with format NHWC or NCHW. It is an output of BatchNormExt2.
+*@li reserve_space_2: A 4D Tensor of type float32, with format NHWC or NCHW. It is an output of BatchNormExt2.
+
+*@par Attributes:
+*@li epsilon: A required float32. A small float number added to the variance of "x".
+*@li data_format: A required string for the format.
+*@li is_training: A required bool, specifying whether the operation is used for training (true) or inference (false).
+
+*@par Outputs:
+*@li x_backprop: A Tensor of type float16 or float32, with format NHWC or NCHW, for the gradient of "x".
+*@li scale_backprop: A Tensor of type float32, with format NHWC or NCHW, for the gradient of "scale".
+*@li offset_backprop: A Tensor of type float32, with format NHWC or NCHW, for the gradient of "offset".
+*@li reserve_space_3: A Tensor of type float32, with format NHWC or NCHW.
+*@li reserve_space_4: A Tensor of type float32, with format NHWC or NCHW.
+
+*@attention Constraints:
+* The preceding layer of this operator must be BatchNormExt2.
+
+*@see BatchNormExt2
+*/
 REG_OP(BatchNormGradExt2)
     .INPUT(y_backprop, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
@@ -189,30 +333,92 @@ REG_OP(BatchNormGradExt2)
     .OUTPUT(reserve_space_4, TensorType({DT_FLOAT}))
     .OP_END_FACTORY_REG(BatchNormGradExt2)
 
-REG_OP(BninferenceD)
+
+/**
+*@brief Performs batch normalization.
+
+*@par Inputs:
+*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
+*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" is with format NHWC or NCHW. Specifies the mean used for inference.
+*@li variance: A Tensor of type float32 or float16. Must be 1D if input "x" is with format NHWC or NCHW. Specifies the variance used for inference.
+*@li momentum: A Tensor of type float16 or float32, specifying the scale factor of "x".
+*@li scale: An optional Tensor. Reserved (not used).
+*@li offset: An optional Tensor. Reserved (not used).
+*@par Attributes:
+*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "1e-5".
+*@li use_global_stats: An optional bool, specifying inference mode. Must be "True".
+*@li mode: An optional Int. Reserved (not used). Defaults to "1".
+*@par Outputs:\n
+*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x".
+*/
+REG_OP(BNInference)
     .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(mean, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(variance, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(momentum, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OPTIONAL_INPUT(scale, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OPTIONAL_INPUT(b, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OPTIONAL_INPUT(offset, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .ATTR(scale_factor, Float,0.999)
     .ATTR(epsilon, Float,1e-5f)
-    .ATTR(moving_average_fraction, Float,0.999)
     .ATTR(use_global_stats, Bool,true)
-    .OP_END_FACTORY_REG(BninferenceD)
-REG_OP(Bninference)
+    .ATTR(mode, Int,1)
+    .OP_END_FACTORY_REG(BNInference)
+/**
+*@brief Computes batch normalization statistics on the host (AI CPU).
+
+*@par Inputs:
+
+*@li mean: A Tensor of type float32 or float16, specifying the mean used for inference.
+*@li variance: A Tensor of type float32 or float16, specifying the variance used for inference.
+*@li momentum: A Tensor of type float32 or float16, specifying the scale factor.
+*@par Attributes:
+*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "1e-5".
+*@li use_global_stats: An optional bool, specifying inference mode. Must be "True".
+*@li mode: An optional Int. Reserved (not used). Defaults to "1".
+*@par Outputs:
+*@li alpha: A Tensor of type float16 or float32, the mean computed on the host.
+*@li beta: A Tensor of type float16 or float32, the variance computed on the host.
+*@li mu: A Tensor of type float16 or float32.
+*/
+REG_OP(BnHost)
+    .INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .INPUT(momentum, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .ATTR(epsilon, Float, 0.00001)
+    .ATTR(mode, Int, 1)
+    .ATTR(use_global_stats, Bool, true)
+    .OUTPUT(alpha, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(beta, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(mu, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OP_END_FACTORY_REG(BnHost)
+/**
+*@brief Performs batch normalization.
+
+*@par Inputs:
+*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
+*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" is with format NHWC or NCHW. Specifies the mean used for inference.
+*@li variance: A Tensor of type float32 or float16. Must be 1D if input "x" is with format NHWC or NCHW. Specifies the variance used for inference.
+*@li scale: An optional Tensor. Reserved (not used).
+*@li b: An optional Tensor. Reserved (not used).
+*@par Attributes:
+*@li momentum: An optional float32, the scale factor of "x". Defaults to "0.9".
+*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "1e-5".
+*@li use_global_stats: An optional bool, specifying inference mode. Must be "True".
+*@li mode: An optional Int. Reserved (not used). Defaults to "1".
+*@par Outputs:\n
+*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x".
+*/
+REG_OP(BNInferenceD)
     .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(mean, TensorType({DT_FLOAT16,DT_FLOAT}))
     .INPUT(variance, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(scale_factor, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OPTIONAL_INPUT(scale, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OPTIONAL_INPUT(b, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .ATTR(momentum, Float,0.9)
     .ATTR(epsilon, Float,1e-5f)
-    .ATTR(moving_average_fraction, Float,0.999)
     .ATTR(use_global_stats, Bool,true)
-    .OP_END_FACTORY_REG(Bninference)
+    .ATTR(mode, Int,1)
+    .OP_END_FACTORY_REG(BNInferenceD)
 } // namespace ge
diff --git a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h
index 175e6e2a..1be85a0e 100644
--- a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h
@@ -21,27 +21,28 @@
 namespace ge {
 /**
-* @brief Computes the gradients of depthwise convolution with respect to the
-* filter.
+* @brief Computes the gradients of depthwise convolution with respect to
+* the filter.
 
 * @par Inputs:
 * Three inputs include: \n
 * @li input: 4D origin shape of input tensor [N, C, H, W] or [N, H, W, C],
 * support float16, float32, double
 * @li filter_size: A 4D tensor of type int32, with shape [H, W, C, K]
-* @li out_backprop: 4D tensor with shape [N, C, H, W] or [N, H, W, C]. Must be
-* one of the following types: float16, float32, double.
+* @li out_backprop: 4D tensor with shape [N, C, H, W] or [N, H, W, C].
+* Must be one of the following types: float16, float32, double.
 
 * @par Attributes:
-* @li strides: An optional list or tuple. The stride of the sliding window for
-* height and width of input "x" of the convolution.
-* Must be with shape [1, 1, stride_height, stride_width] or [1, stride_height,
-* stride_width, 1].
+* @li strides: A required list or tuple. The stride of the sliding window
+* for height and width of input "x" of the convolution.
+* Must be with shape [1, 1, stride_height, stride_width] or
+* [1, stride_height, stride_width, 1].
 * @li dilations: An optional list or tuple. The dilation factor for each
-* dimension of input "x". If set to k > 1, there will be k-1 skipped cells
-* between each filter element on that dimension. Must be with shape [1, 1,
-* dilation_height, dilation_width] or [1, dilation_height, dilation_width, 1].
-* @li pads: An optional list or tuple. Padding added to each dimension of the
+* dimension of input "x".
+* If set to k > 1, there will be k-1 skipped cells between each filter element
+* on that dimension. Must be with shape [1, 1, dilation_height, dilation_width]
+* or [1, dilation_height, dilation_width, 1].
+* @li pads: A required list or tuple. Padding added to each dimension of the
 * input.
 * @li data_format: An optional string. Input data format, either "NHWC" or
 * "NCHW".
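For reference, the BNInference/BNInferenceD registrations above reduce to the usual inference-mode batch normalization, y = scale * (x - mean) / sqrt(variance + epsilon) + offset, applied per channel. A minimal C++ sketch under that reading (the helper name and the flat NCHW layout are illustrative assumptions; "momentum" is left out because its exact role is not specified by these docs):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // y[n,c,h,w] = scale[c] * (x[n,c,h,w] - mean[c]) / sqrt(var[c] + eps) + offset[c]
    std::vector<float> BnInferenceRef(const std::vector<float> &x,
                                      const std::vector<float> &mean,
                                      const std::vector<float> &var,
                                      const std::vector<float> &scale,   // pass 1.0s if unused
                                      const std::vector<float> &offset,  // pass 0.0s if unused
                                      size_t n_dim, size_t c_dim, size_t h_dim, size_t w_dim,
                                      float eps = 1e-5f) {
      std::vector<float> y(x.size());
      for (size_t n = 0; n < n_dim; ++n) {
        for (size_t c = 0; c < c_dim; ++c) {
          const float inv_std = 1.0f / std::sqrt(var[c] + eps);
          for (size_t i = 0; i < h_dim * w_dim; ++i) {
            const size_t idx = (n * c_dim + c) * h_dim * w_dim + i;
            y[idx] = scale[c] * (x[idx] - mean[c]) * inv_std + offset[c];
          }
        }
      }
      return y;
    }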
@@ -61,40 +62,41 @@
 * data is 5D with shape [N, C1, Ho, Wo, C0],
 * where C is the same as that of the feature map and C0 is 16.\n
 * Limited by Tiling and L1 / L0 buffer memory: 512 * ceil(Wo, 16) + (480 *
 * stride_h + 32 * filter_h) * ceil(Wi, 16) ≤ l1_size and Hf*Wf ≤ l0b_size/512.\n
 */
 REG_OP(DepthwiseConv2DBackpropFilter)
     .INPUT(input, TensorType({float16}))
     .INPUT(filter_size, TensorType({DT_INT32, DT_INT64}))
     .INPUT(out_backprop, TensorType({float16}))
     .OUTPUT(filter_grad, TensorType({float32}))
-    .ATTR(strides, ListInt, {1, 1, 1, 1})
+    .REQUIRED_ATTR(strides, ListInt)
     .ATTR(dilations, ListInt, {1, 1, 1, 1})
-    .ATTR(pads, ListInt, {0, 0, 0, 0})
+    .REQUIRED_ATTR(pads, ListInt)
     .ATTR(data_format, String, "NHWC")
     .OP_END_FACTORY_REG(DepthwiseConv2DBackpropFilter)
 
 /**
-* @brief Computes the gradients of depthwise convolution with respect to the
-* filter.
+* @brief Computes the gradients of depthwise convolution with respect to
+* the filter.
 
 * @par Inputs:
 * Two inputs include: \n
 * @li input: 4D tensor with shape [N, C, H, W] or [N, H, W, C], of type float16
-* @li out_backprop: 4D tensor with shape [N, C, H, W] or [N, H, W, C], of type
-* float16
+* @li out_backprop: 4D tensor with shape [N, C, H, W] or [N, H, W, C],
+* of type float16
 
 * @par Attributes:
-* @li filter_size: An optional list or tuple. Shape of filter.
-* @li strides: An optional list or tuple. The stride of the sliding window for
+* @li filter_size: A required list or tuple. Shape of filter.
+* @li strides: A required list or tuple. The stride of the sliding window for
 * height and width of input "x" of the convolution.
 * Must be with shape [1, 1, stride_height, stride_width] or [1, stride_height,
 * stride_width, 1].
 * @li dilations: An optional list or tuple. The dilation factor for each
-* dimension of input "x". If set to k > 1, there will be k-1 skipped cells
-* between each filter element on that dimension. Must be with shape [1, 1,
-* dilation_height, dilation_width] or [1, dilation_height, dilation_width, 1].
-* @li pads: An optional list or tuple. Padding added to each dimension of the
+* dimension of input "x".
+* If set to k > 1, there will be k-1 skipped cells between each filter element
+* on that dimension. Must be with shape [1, 1, dilation_height, dilation_width]
+* or [1, dilation_height, dilation_width, 1].
+* @li pads: A required list or tuple. Padding added to each dimension of the
 * input.
 * @li data_format: An optional string. Input data format, either "NHWC" or
 * "NCHW".
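The tiling constraint quoted above (512 * ceil(Wo, 16) + (480 * stride_h + 32 * filter_h) * ceil(Wi, 16) ≤ l1_size, and Hf*Wf ≤ l0b_size/512) can be pre-checked on the host. A hedged sketch, reading "ceil(x, 16)" as rounding x up to a multiple of 16 and assuming l1_size/l0b_size are capacities taken from the target SoC description (the function name is illustrative):

    #include <cstdint>

    inline int64_t CeilTo16(int64_t v) { return (v + 15) / 16 * 16; }  // assumed reading of ceil(x, 16)

    // True if DepthwiseConv2DBackpropFilter fits the documented L1/L0B limits.
    bool FitsDepthwiseBackpropFilterTiling(int64_t wo, int64_t wi, int64_t stride_h,
                                           int64_t hf, int64_t wf,
                                           int64_t l1_size, int64_t l0b_size) {
      const int64_t l1_need = 512 * CeilTo16(wo) + (480 * stride_h + 32 * hf) * CeilTo16(wi);
      return l1_need <= l1_size && hf * wf <= l0b_size / 512;
    }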
@@ -107,21 +109,22 @@ REG_OP(DepthwiseConv2DBackpropFilter)
 * The feature map is 4D with shape [N, C, Hi, Wi] or [N, Hi, Wi, C], but
 * the data is 5D with shape [N, C1, Hi, Wi, C0], where C0 is 16.\n
 * The filter is 4D with shape [Hf, Wf, C, K], but the data is 6D with shape
-* [C1, Hf, Wf, K, Co, C0], where K is fixed at 1, and Co and C0 are 16.\n
+* [C1, Hf, Wf, K, Co, C0],
+* where K is fixed at 1, and Co and C0 are 16.\n
 * Output backprop is 4D with shape [N, C, Ho, Wo] or [N, Ho, Wo, C], but the
-* data is 5D with shape [N, C1, Ho, Wo, C0], where C is the same as that of the
-* feature map and C0 is 16.\n
+* data is 5D with shape [N, C1, Ho, Wo, C0],
+* where C is the same as that of the feature map and C0 is 16.\n
 * Limited by Tiling and L1 / L0 buffer memory: 512 * ceil(Wo, 16) + (480 *
 * stride_h + 32 * filter_h) * ceil(Wi, 16) ≤ l1_size and Hf*Wf ≤ l0b_size/512.\n
 */
 REG_OP(DepthwiseConv2DBackpropFilterD)
     .INPUT(input, TensorType({float16}))
     .INPUT(out_backprop, TensorType({float16}))
     .OUTPUT(filter_grad, TensorType({float32}))
-    .ATTR(filter_size, ListInt, {1, 1, 1, 1})
-    .ATTR(strides, ListInt, {1, 1, 1, 1})
+    .REQUIRED_ATTR(filter_size, ListInt)
+    .REQUIRED_ATTR(strides, ListInt)
     .ATTR(dilations, ListInt, {1, 1, 1, 1})
-    .ATTR(pads, ListInt, {0, 0, 0, 0})
+    .REQUIRED_ATTR(pads, ListInt)
     .ATTR(data_format, String, "NHWC")
     .OP_END_FACTORY_REG(DepthwiseConv2DBackpropFilterD)
 
@@ -139,15 +142,16 @@ REG_OP(DepthwiseConv2DBackpropFilterD)
 * Must be one of the following types: float16, float32, double.
 
 * @par Attributes:
-* @li strides: An optional list or tuple. The stride of the sliding window for
+* @li strides: A required list or tuple. The stride of the sliding window for
 * height and width of input "x" of the convolution.
 * Must be with shape [1, 1, stride_height, stride_width] or [1, stride_height,
 * stride_width, 1].
 * @li dilations: An optional list or tuple. The dilation factor for each
-* dimension of input "x". If set to k > 1, there will be k-1 skipped cells
-* between each filter element on that dimension. Must be with shape [1, 1,
-* dilation_height, dilation_width] or [1, dilation_height, dilation_width, 1].
-* @li pads: An optional list or tuple. Padding added to each dimension of the
+* dimension of input "x".
+* If set to k > 1, there will be k-1 skipped cells between each filter element
+* on that dimension. Must be with shape [1, 1, dilation_height, dilation_width]
+* or [1, dilation_height, dilation_width, 1].
+* @li pads: A required list or tuple. Padding added to each dimension of the
 * input.
 * @li data_format: An optional string. Input data format, either "NHWC" or
 * "NCHW".
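The strides, dilations and pads attributes defined above combine in the standard convolution geometry. A worked sketch of the implied output size (generic arithmetic, not a GraphEngine API):

    #include <cstdint>

    // effective_filter = (filter - 1) * dilation + 1
    // out = (in + pad_before + pad_after - effective_filter) / stride + 1
    int64_t ConvOutDim(int64_t in, int64_t filter, int64_t stride, int64_t dilation,
                       int64_t pad_before, int64_t pad_after) {
      const int64_t eff = (filter - 1) * dilation + 1;
      return (in + pad_before + pad_after - eff) / stride + 1;
    }
    // Example: in = 224, filter = 3, stride = 2, dilation = 1, pads = 1/1 -> 112.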
@@ -161,21 +165,22 @@ REG_OP(DepthwiseConv2DBackpropFilterD)
 * The feature map is 4D with shape [N, C, Hi, Wi] or [N, Hi, Wi, C], but
 * the data is 5D with shape [N, C1, Hi, Wi, C0], where C0 is 16.\n
 * The filter is 4D with shape [Hf, Wf, C, K], but the data is 6D with shape
-* [C1, Hf, Wf, K, Co, C0], where K is fixed at 1, and Co and C0 are 16.\n
+* [C1, Hf, Wf, K, Co, C0],
+* where K is fixed at 1, and Co and C0 are 16.\n
 * Output backprop is 4D with shape [N, C, Ho, Wo] or [N, Ho, Wo, C], but the
-* data is 5D with shape [N, C1, Ho, Wo, C0], where C is the same as that of the
-* feature map and C0 is 16.\n
-* Limited by Tiling: max_h_in_l1 ≥ C0, where max_h_in_l1 = (l1_size - Hf * Wf *
-* C0 * C0 * 2) / (2 * Wo * C0).\n
+* data is 5D with shape [N, C1, Ho, Wo, C0],
+* where C is the same as that of the feature map and C0 is 16.\n
+* Limited by Tiling: max_h_in_l1 ≥ C0, where max_h_in_l1 = (l1_size - Hf *
+* Wf * C0 * C0 * 2) / (2 * Wo * C0).\n
 */
 REG_OP(DepthwiseConv2DBackpropInput)
     .INPUT(input_size, TensorType({DT_INT32, DT_INT64}))
     .INPUT(filter, TensorType({DT_FLOAT16}))
     .INPUT(out_backprop, TensorType({DT_FLOAT16}))
     .OUTPUT(input_grad, TensorType({DT_FLOAT16}))
-    .ATTR(strides, ListInt, {1, 1, 1, 1})
+    .REQUIRED_ATTR(strides, ListInt)
     .ATTR(dilations, ListInt, {1, 1, 1, 1})
-    .ATTR(pads, ListInt, {0, 0, 0, 0})
+    .REQUIRED_ATTR(pads, ListInt)
     .ATTR(data_format, String, "NHWC")
     .OP_END_FACTORY_REG(DepthwiseConv2DBackpropInput)
 
@@ -186,47 +191,49 @@ REG_OP(DepthwiseConv2DBackpropInput)
 * @par Inputs:
 * Two inputs include: \n
 * @li filter: A 4D tensor of type float16, with shape [H, W, C, K]
-* @li out_backprop: 4D tensor with shape [N, C, H, W] or [N, H, W, C], of type
-* float16
+* @li out_backprop: 4D tensor with shape [N, C, H, W] or [N, H, W, C], of
+* type float16
 
 * @par Attributes:
-* @li input_size: An optional list or tuple. The origin shape of input.
-* @li strides: An optional list or tuple. The stride of the sliding window for
+* @li input_size: A required list or tuple. The origin shape of input.
+* @li strides: A required list or tuple. The stride of the sliding window for
 * height and width of input "x" of the convolution.
 * Must be with shape [1, 1, stride_height, stride_width] or [1, stride_height,
 * stride_width, 1].
 * @li dilations: An optional list or tuple. The dilation factor for each
-* dimension of input "x". If set to k > 1, there will be k-1 skipped cells
-* between each filter element on that dimension. Must be with shape [1, 1,
-* dilation_height, dilation_width] or [1, dilation_height, dilation_width, 1].
-* @li pads: An optional list or tuple. Padding added to each dimension of the
+* dimension of input "x".
+* If set to k > 1, there will be k-1 skipped cells between each filter element
+* on that dimension. Must be with shape [1, 1, dilation_height, dilation_width]
+* or [1, dilation_height, dilation_width, 1].
+* @li pads: A required list or tuple. Padding added to each dimension of the
 * input.
 * @li data_format: An optional string. Input data format, either "NHWC" or
 * "NCHW".
 
 * @par Outputs:
-* input_grad: Gradient of the deep convolution relative to the input with shape
-* [N, C, H, W] or [N, H, W, C]. Must be of type float16.
+* input_grad: Gradient of the deep convolution relative to the input with
+* shape [N, C, H, W] or [N, H, W, C]. Must be of type float16.
 * @attention Constraints:\n
 * The feature map is 4D with shape [N, C, Hi, Wi] or [N, Hi, Wi, C], but
 * the data is 5D with shape [N, C1, Hi, Wi, C0], where C0 is 16.\n
 * The filter is 4D with shape [Hf, Wf, C, K], but the data is 6D with shape
-* [C1, Hf, Wf, K, Co, C0], where K is fixed at 1, and Co and C0 are 16.\n
+* [C1, Hf, Wf, K, Co, C0],
+* where K is fixed at 1, and Co and C0 are 16.\n
 * Output backprop is 4D with shape [N, C, Ho, Wo] or [N, Ho, Wo, C], but the
-* data is 5D with shape [N, C1, Ho, Wo, C0], where C is the same as that of the
-* feature map and C0 is 16.\n
-* Limited by Tiling: max_h_in_l1 ≥ C0, where max_h_in_l1 = (l1_size - Hf * Wf *
-* C0 * C0 * 2) / (2 * Wo * C0).\n
+* data is 5D with shape [N, C1, Ho, Wo, C0],
+* where C is the same as that of the feature map and C0 is 16.\n
+* Limited by Tiling: max_h_in_l1 ≥ C0, where max_h_in_l1 = (l1_size - Hf *
+* Wf * C0 * C0 * 2) / (2 * Wo * C0).\n
 */
 REG_OP(DepthwiseConv2DBackpropInputD)
     .INPUT(filter, TensorType({DT_FLOAT16}))
     .INPUT(out_backprop, TensorType({DT_FLOAT16}))
     .OUTPUT(input_grad, TensorType({DT_FLOAT16}))
-    .ATTR(input_size, ListInt, {1, 1, 1, 1})
-    .ATTR(strides, ListInt, {1, 1, 1, 1})
+    .REQUIRED_ATTR(input_size, ListInt)
+    .REQUIRED_ATTR(strides, ListInt)
     .ATTR(dilations, ListInt, {1, 1, 1, 1})
-    .ATTR(pads, ListInt, {0, 0, 0, 0})
+    .REQUIRED_ATTR(pads, ListInt)
     .ATTR(data_format, String, "NHWC")
     .OP_END_FACTORY_REG(DepthwiseConv2DBackpropInputD)
 
@@ -242,15 +249,16 @@ REG_OP(DepthwiseConv2DBackpropInputD)
 * @li offset_w: An optional float16, used for quantized inference
 
 * @par Attributes:
-* @li strides: An optional list or tuple. The stride of the sliding window for
+* @li strides: A required list or tuple. The stride of the sliding window for
 * height and width of input "x" of the convolution.
 * Must be with shape [1, 1, stride_height, stride_width] or [1, stride_height,
 * stride_width, 1].
 * @li dilations: An optional list or tuple. The dilation factor for each
-* dimension of input "x". If set to k > 1, there will be k-1 skipped cells
-* between each filter element on that dimension. Must be with shape [1, 1,
-* dilation_height, dilation_width] or [1, dilation_height, dilation_width, 1].
-* @li pads: An optional list or tuple. Padding added to each dimension of the
+* dimension of input "x".
+* If set to k > 1, there will be k-1 skipped cells between each filter element
+* on that dimension. Must be with shape [1, 1, dilation_height, dilation_width]
+* or [1, dilation_height, dilation_width, 1].
+* @li pads: A required list or tuple. Padding added to each dimension of the
 * input.
 * @li data_format: An optional string. Input data format, either "NHWC" or
 * "NCHW".
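The constraints above repeatedly map 4D logical shapes onto the 5D NC1HWC0 device layout with C0 = 16 and C1 = ceil(C / C0). A minimal sketch of that shape computation (the helper name is illustrative):

    #include <array>
    #include <cstdint>

    constexpr int64_t kC0 = 16;

    // [N, C, H, W] -> [N, C1, H, W, C0] with C1 = ceil(C / C0)
    std::array<int64_t, 5> ToNC1HWC0(int64_t n, int64_t c, int64_t h, int64_t w) {
      const int64_t c1 = (c + kC0 - 1) / kC0;
      return {n, c1, h, w, kC0};
    }
    // Example: [1, 35, 7, 7] -> [1, 3, 7, 7, 16].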
@@ -263,7 +271,8 @@ REG_OP(DepthwiseConv2DBackpropInputD) * The feature map is 4D with shape [N, C, Hi, Wi] or [N, Hi, Wi, C], but * the data is 5D with shape [N, C1, Hi, Wi, C0], where C0 is 16.\n * The filter is 4D with shape [Hf, Wf, C, K], but the data is 6D with shape -* [C1, Hf, Wf, K, Co, C0], where K is fixed at 1, and Co and C0 are 16.\n +* [C1, Hf, Wf, K, Co, C0], +* where K is fixed at 1, and Co and C0 are 16.\n * Limited by the size of L1 buffer memory: \n * (l1_size - filter_h*filter_w*BLOCK_SIZE*BLOCK_SIZE*data_size) // (Wi * * BLOCK_SIZE * data_size) >= (BLOCK_SIZE * strides_h + filter_h - strides_h).\n @@ -274,11 +283,11 @@ REG_OP(DepthwiseConv2D) .OPTIONAL_INPUT(bias, TensorType({DT_INT8})) .OPTIONAL_INPUT(offset_w, TensorType({DT_FLOAT16})) .OUTPUT(y, TensorType({DT_FLOAT16})) - .ATTR(strides, ListInt, {}) - .ATTR(dilations, ListInt, {}) - .ATTR(pads, ListInt, {0, 0, 0, 0}) + .REQUIRED_ATTR(strides, ListInt) + .ATTR(dilations, ListInt, {1, 1, 1, 1}) + .REQUIRED_ATTR(pads, ListInt) .ATTR(data_format, String, "NHWC") - .ATTR(offset_a, Int, 0) + .ATTR(offset_x, Int, 0) .OP_END_FACTORY_REG(DepthwiseConv2D) REG_OP(Conv2DCCE) @@ -353,9 +362,9 @@ REG_OP(BiasAddGrad) *@brief Computes the gradients of convolution with respect to the input. *@par Inputs: * Three inputs: - * @li input_sizes: A Tensor of type int32. An integer vector representing the shape of input, + * @li input_size: A Tensor of type int32. An integer vector representing the shape of input, * where input is a 4-D tensor [batch, height, width, channels] or [batch, channels, height, width]. - * @li filters: A Tensor. Must be one of the following types: float16. + * @li filter: A Tensor. Must be one of the following types: float16, float32, float64. * 4-D with shape [filter_height, filter_width, in_channels, out_channels] * or [out_channels, filter_height, filter_width, in_channels] or [out_channels, in_channel, filter_height, filter_width]. * @li out_backprop: A Tensor. Must have the same type as filter. 4-D with shape [batch, out_height, out_width, out_channels] @@ -365,24 +374,28 @@ REG_OP(BiasAddGrad) * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1] + * @li groups: Number of blocked connections from input channels to output channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as filter,and has same format as input_size */ REG_OP(Conv2DBackpropInput) - .INPUT(input_sizes, TensorType({DT_INT32, DT_INT64})) - .INPUT(filters, TensorType{DT_FLOAT16}) - .INPUT(out_backprop, TensorType{DT_FLOAT16}) - .OUTPUT(y, TensorType{DT_FLOAT16}) + .INPUT(input_size, TensorType({DT_INT32})) + .INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .INPUT(out_backprop, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .REQUIRED_ATTR(strides, ListInt) - .ATTR(pads, ListInt, {1, 1, 1, 1}) + .REQUIRED_ATTR(pads, ListInt) .ATTR(dilations, ListInt, {1, 1, 1, 1}) + .ATTR(groups, Int, 1) + .ATTR(data_format, String, "NHWC") .OP_END_FACTORY_REG(Conv2DBackpropInput) /** *@brief Computes the gradients of convolution with respect to the input. 
*@par Inputs: * Two inputs: - * @li filters: A Tensor. Types is float16. + * @li filter: A Tensor. Types is float16. * 4-D with shape [filter_height, filter_width, in_channels, out_channels] or [out_channels, filter_height, filter_width, in_channels] * or [out_channels, in_channel, filter_height, filter_width]. * @li out_backprop: A Tensor. Must have the same type as filter. 4-D with shape [batch, out_height, out_width, out_channels] @@ -394,50 +407,54 @@ REG_OP(Conv2DBackpropInput) * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1] + * @li groups: Number of blocked connections from input channels to output channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as filter,4-D tensor [batch, height, width, channels] or [batch, channels, height, width]. */ REG_OP(Conv2DBackpropInputD) - .INPUT(filters, TensorType{DT_FLOAT16}) - .INPUT(out_backprop, TensorType{DT_FLOAT16}) - .OUTPUT(y, TensorType{DT_FLOAT16}) - .REQUIRED_ATTR(input_sizes, ListInt) + .INPUT(filter, TensorType({DT_FLOAT16, DT_INT8})) + .INPUT(out_backprop, TensorType({DT_FLOAT16, DT_INT8})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32})) + .REQUIRED_ATTR(input_size, ListInt) .REQUIRED_ATTR(strides, ListInt) - .ATTR(pads, ListInt, {1, 1, 1, 1}) + .REQUIRED_ATTR(pads, ListInt) .ATTR(dilations, ListInt, {1, 1, 1, 1}) + .ATTR(groups, Int, 1) + .ATTR(data_format, String, "NHWC") .OP_END_FACTORY_REG(Conv2DBackpropInputD) /** *@brief Computes the Deconvolution with respect to the input. *@par Inputs: - * Two inputs: - * @li x: A Tensor. Must have the same type as "filter". 4D with shape\n - * [batch, out_height, out_width, out_channels]\n - * or [batch, out_channels, out_height, out_width]. Gradients with respect\n + * Three inputs: + * @li x: A Tensor. Must have the same type as "filter". 4D with shape + * [batch, out_height, out_width, out_channels] + * or [batch, out_channels, out_height, out_width]. Gradients with respect * to the output of the convolution. * @li filter: A Tensor of type float16. - * 4D with shape [filter_height, filter_width, in_channels, out_channels],\n - * or [out_channels, filter_height, filter_width, in_channels], \n + * 4D with shape [filter_height, filter_width, in_channels, out_channels], + * or [out_channels, filter_height, filter_width, in_channels], * or [out_channels, in_channel, filter_height, filter_width]. * One optional input: * @li bias: An optional tensor of type int8 *@par Attributes: * Three attributes: - * @li strides: A tuple or list of 2 integers. The stride of the sliding window\n + * @li strides: A tuple or list of 2 integers. The stride of the sliding window * for H/W dimension. - * @li pads: A tuple or list of 4 integers. The [top, bottom, left, right] \n + * @li pads: A tuple or list of 4 integers. The [top, bottom, left, right] * padding on the feature map - * @li dilations: A tuple or list of 4 integers. The dilation factor for each\n + * @li dilations: A tuple or list of 4 integers. The dilation factor for each * dimension of input. Must be [1, 1, 1, 1]. *@par Outputs: - * y: A Tensor. Has the same type as "filter". 4D tensor with shape\n + * y: A Tensor. Has the same type as "filter". 
4D tensor with shape * [batch, height, width, channels] or [batch, channels, height, width]. */ REG_OP(Deconvolution) - .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) - .INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) - .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE)) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8})) + .INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8})) + .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32})) .ATTR(strides, ListInt, {1, 1, 1, 1}) .ATTR(pads, ListInt, {0, 0, 0, 0}) .ATTR(dilations, ListInt, {1, 1, 1, 1}) @@ -446,9 +463,9 @@ REG_OP(Deconvolution) *@brief Computes the gradients of convolution with respect to the filter *@par Inputs: * Three inputs: - * @li x: A Tensor. Must be one of the following types: float16. + * @li x: A Tensor. Must be one of the following types: float16, float32, float64. * 4-D with shape [batch, in_height, in_width, in_channels] or [batch, in_channels, in_height, in_width]. - * @li filter_sizes: A Tensor of type int32. An integer vector representing the tensor shape of filter, + * @li filter_size: A Tensor of type int32. An integer vector representing the tensor shape of filter, * where filter is a 4-D tensor [filter_height, filter_width, in_channels, out_channels] * or [out_channels, filter_height, filter_width, in_channels] or [out_channels, in_channel, filter_height, filter_width]. * @li out_backprop: A Tensor. Must have the same type as x. 4-D with shape [batch, out_height, out_width, out_channels] @@ -458,17 +475,21 @@ REG_OP(Deconvolution) * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map. * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as x */ REG_OP(Conv2DBackpropFilter) - .INPUT(x, TensorType{DT_FLOAT16}) - .INPUT(filter_sizes, TensorType({DT_INT32, DT_INT64})) - .INPUT(out_backprop, TensorType{DT_FLOAT16}) - .OUTPUT(y, TensorType{DT_FLOAT}) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .INPUT(filter_size, TensorType({DT_INT32})) + .INPUT(out_backprop, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .REQUIRED_ATTR(strides, ListInt) - .ATTR(pads, ListInt, {1, 1, 1, 1}) + .REQUIRED_ATTR(pads, ListInt) .ATTR(dilations, ListInt, {1, 1, 1, 1}) + .ATTR(groups, Int, 1) + .ATTR(data_format, String, "NHWC") .OP_END_FACTORY_REG(Conv2DBackpropFilter) /** @@ -481,36 +502,239 @@ REG_OP(Conv2DBackpropFilter) * or [batch, out_channels, out_height, out_width]. Gradients with respect to the output of the convolution. *@par Attributes: * Four attributes: - * @li filter_sizes: A Tensor of type integers. An integer vector representing the tensor shape of filter, + * @li filter_size: A Tensor of type integers. 
An integer vector representing the tensor shape of filter, * where filter is a 4-D tensor [filter_height, filter_width, in_channels, out_channels] * or [out_channels, filter_height, filter_width, in_channels] or [out_channels, in_channel, filter_height, filter_width]. * @li strides: A tuple/list of 2 integers. The stride of the sliding window for H/W dimension. * @li pads: A tuple/list of 4 integers, [top, bottom, left, right] pads on feature map * @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension of input, now only support [1,1,1,1]. + * @li groups: Number of blocked connections from input channels to output channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as x */ REG_OP(Conv2DBackpropFilterD) - .INPUT(x, TensorType{DT_FLOAT16}) - .INPUT(out_backprop, TensorType{DT_FLOAT16}) - .OUTPUT(y, TensorType{DT_FLOAT}) - .REQUIRED_ATTR(filter_sizes, ListInt) + .INPUT(x, TensorType({DT_FLOAT16})) + .INPUT(out_backprop, TensorType({DT_FLOAT16})) + .OUTPUT(y, TensorType({DT_FLOAT})) + .REQUIRED_ATTR(filter_size, ListInt) .REQUIRED_ATTR(strides, ListInt) - .ATTR(pads, ListInt, {1, 1, 1, 1}) + .REQUIRED_ATTR(pads, ListInt) .ATTR(dilations, ListInt, {1, 1, 1, 1}) + .ATTR(groups, Int, 1) + .ATTR(data_format, String, "NHWC") .OP_END_FACTORY_REG(Conv2DBackpropFilterD) +/** +*@brief Computes a 2D convolution given 4D "x" and "filter" tensors. +*@par Inputs: +* @li x: A 4D tensor of input images. +* @li filter: A 4D tensor of filters. +* @li bias: An optional 1D tensor. +* @li offset_w: An optional 1D tensor for quantized convolution. Reserved.\n +* \n +* The input and output tensor attributes are listed as follows: +* @verbatim + Tensor | x | filter | bias | offset_w | y + -----------|---------|---------|---------|----------|-------- + Data Type | float16 | float16 | float16 | _ | float16 + |---------|---------|---------|----------|-------- + | float32 | float32 | float32 | _ | float32 + |---------|---------|---------|----------|-------- + | float64 | float64 | float64 | _ | float64 + |---------|---------|---------|----------|-------- + | int8 | int8 | int32 | int8 | int32 + -----------|---------|---------|---------|----------|-------- + Format | NCHW | NCHW | ND | ND | NCHW + | NHWC | NHWC | | | NHWC + | | HWCN | | | +@endverbatim +* It should be noted that the data types must correspond to each other, but the +* format does not need to. + +*@par Attributes: +* @li strides: A list of 4 integers. Specifying the strides of the +* convolution along the height and width. The dimension order is determined +* by the data format of "x". By default the N and C dimensions are set to 1. +* @li pads: A list of 4 integers. Specifying the top, bottom, left and right +* padding. +* @li dilations: A list of 4 integers. Specifying the dilation rate to use +* for dilated convolution. Has the same dimension order and value as "strides". +* @li groups: Number of blocked connections from input channels to output +* channels. Input channels and output channels must both be divisible by +* "groups". Must be set to 1. +* @li offset_x: An optional integer for quantized convolution. +* @li data_format: An optional string from: "NHWC", "NCHW". Specifying the +* data format of the input and output images. Reserved. + +*@par Outputs: +* @li y: A 4D Tensor of output images. 
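+*
+* A hedged usage sketch, assuming the setters that REG_OP generates
+* (set_input_<name>() / set_attr_<name>()); "data" and "weight" stand in for
+* upstream operators and are illustrative only:
+* @verbatim
+    auto conv = op::Conv2D("conv1")
+                    .set_input_x(data)
+                    .set_input_filter(weight)
+                    .set_attr_strides({1, 2, 2, 1})   // NHWC order: N, H, W, C
+                    .set_attr_pads({0, 0, 0, 0})      // top, bottom, left, right
+                    .set_attr_dilations({1, 1, 1, 1});
+@endverbatim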
+ +*@attention +* @li The parameter scope is listed as follows:\n +* @verbatim + Name | Field | Scope + ------------------|--------------|---------- + Input Image Size | H dimension | [1, 4096] + | W dimension | [1, 4096] + ------------------|--------------|---------- + Filter Size | H dimension | [1, 255] + | W dimension | [1, 255] + ------------------|--------------|---------- + Stride Size | H dimension | [1, 63] + | W dimension | [1, 63] + ------------------|--------------|---------- + Padding Size | top side | [0, 255] + | bottom side | [0, 255] + | left side | [0, 255] + | right side | [0, 255] + ------------------|--------------|---------- + Dilation Size | H dimension | [1, 255] + | W dimension | [1, 255] +@endverbatim + +* @li There are restrictions for certain scenarios: +* @verbatim + Output | Restrictions + ------------------|---------------------------------------------- + W dimension == 1 | HxW(input) == HxW(filter) == 1x1,2x2...11x11. + H dimension == 1 | + ------------------|---------------------------------------------- + W dimension == 1 | Not supported + H dimension != 1 | +@endverbatim +* As shown above, "HxW(input)" indicates the image size after padding and +* "HxW(filter)" indicates the filter size after dilation. + +*/ REG_OP(Conv2D) - .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8})) // the featrue map tensor - .INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8})) // the filter tensor - .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32})) // optional 1D bias to be added to the conv2d + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8})) + .INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8})) + .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32})) .OPTIONAL_INPUT(offset_w, TensorType({DT_INT8})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32})) // the output tensor - .ATTR(strides, ListInt, {1, 1, 1, 1}) // stride on H\W, format sensitive - .ATTR(pads, ListInt, {0, 0, 0, 0}) // top, bottom, left and right pads on feature map - .ATTR(dilations, ListInt, {1, 1, 1, 1}) // dilation on H\W, format sensitive - .ATTR(offset_a, Int, 0) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32})) + .REQUIRED_ATTR(strides, ListInt) + .REQUIRED_ATTR(pads, ListInt) + .ATTR(dilations, ListInt, {1, 1, 1, 1}) + .ATTR(groups, Int, 1) + .ATTR(data_format, String, "NHWC") + .ATTR(offset_x, Int, 0) .OP_END_FACTORY_REG(Conv2D) +/** +*@brief Computes a 3D convolution given 5D "x" and "filter" tensors. +*@par Inputs: +*@li x: A 5D tensor. Must be one of the following types: float16, float32, float64. The format is NCDHW or NDHWC. +*@li filter: A 5D tensor of the same type as "x". The format is NCDHW, NDHWC or DHWCN. +*@li bias: An optional 1D tensor of the same type as "x". + +*@par Attributes: +*@li strides: A list of 5 ints. Specifies the stride of the sliding window for each dimension of "x". The N and C dimensions must be 1. Has the same format as "x". +*@li pads: A list of 6 ints. Supports only padding along the D, H and W dimensions in sequence of head, tail, top, bottom, left and right. +*@li padding_mode: An optional string from: "zeros", "circular". Defaults to "zeros". +*@li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. +*@li dilations: A list of 5 ints. Specifies the dilation factor for each dimension of "x". The N and C dimensions must be 1. 
Has the same format as "x". + +*@par Outputs: +*y: A Tensor. Has the same type as "x". + +*@attention Constraints:\n +*The image size after padding is greater than the filter size.\n + +*/ +REG_OP(Conv3D) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .ATTR(strides, ListInt, {1, 1, 1, 1, 1}) + .ATTR(pads, ListInt, {0, 0, 0, 0, 0, 0}) + .ATTR(padding_mode, String, "zeros") + .ATTR(data_format, String, "NDHWC") + .ATTR(dilations, ListInt, {1, 1, 1, 1, 1}) + .OP_END_FACTORY_REG(Conv3D) + +/** +*@brief Computes the gradients of convolution 3d with respect to the input. +*@par Inputs: + * Three inputs: + * @li input_sizes: A Tensor of type int32, int64. An integer vector representing the shape of input, + * where input is a 5-D tensor [batch, depth, height, width, channels] or [batch, channels, depth, height, width]. + * @li filters: A Tensor. Must be one of the following types: float16, float32, float64. + * @li grads: A Tensor. Must have the same type as filter. 5-D with shape [batch, depth, out_height, out_width, out_channels] + * or [batch, out_channels, depth, out_height, out_width]. Gradients with respect to the output of the convolution. +*@par Attributes: + * Four attributes: + * @li strides: A tuple/list of 3 integers. The stride of the sliding window for D/H/W dimension. + * @li pads: A tuple/list of 6 integers + * @li dilations: A tuple/list of 6 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1] + * @li data_format: An optional string from: "NDHWC", "NCHWD". Defaults to "NDHWC". Specify the data format of the input and output data. +*@par Outputs: + * y: A Tensor. Has the same type as filter,and has same format as input_size +*/ +REG_OP(Conv3DBackpropInput) + .INPUT(input_sizes, TensorType({DT_INT32, DT_INT64})) + .INPUT(filters, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .INPUT(grads, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .REQUIRED_ATTR(strides, ListInt) + .ATTR(pads, ListInt, {0, 0, 0, 0, 0, 0}) + .ATTR(data_format, String, "NDHWC") + .ATTR(dilations, ListInt, {1, 1, 1, 1, 1}) + .OP_END_FACTORY_REG(Conv3DBackpropInput) + +/** +*@brief Computes the gradients of convolution 3d with respect to the input. +*@par Inputs: + * Two inputs: + * @li filters: A Tensor. Types is float16. + * @li grads: A Tensor. Must have the same type as filter. +*@par Attributes: + * Five attributes: + * @li input_sizes A Tensor of type int32. An integer vector representing the shape of input, + * @li strides: A tuple/list of 3 integers. The stride of the sliding window for D/H/W dimension. + * @li pads: A tuple/list of 4 integers + * @li dilations: A tuple/list of 5 integers, The dilation factor for each dimension of input, now only support [1,1,1,1,1] + * @li data_format: An optional string from: "NDHWC", "NCHWD". Defaults to "NDHWC". Specify the data format of the input and output data. +*@par Outputs: + * y: A Tensor. 
Has the same type as "filter".
+*/
+REG_OP(Conv3DBackpropInputD)
+    .INPUT(filters, TensorType({DT_FLOAT16}))
+    .INPUT(grads, TensorType({DT_FLOAT16}))
+    .OUTPUT(y, TensorType({DT_FLOAT16}))
+    .REQUIRED_ATTR(input_sizes, ListInt)
+    .REQUIRED_ATTR(strides, ListInt)
+    .ATTR(pads, ListInt, {0, 0, 0, 0, 0, 0})
+    .ATTR(data_format, String, "NDHWC")
+    .ATTR(dilations, ListInt, {1, 1, 1, 1, 1})
+    .OP_END_FACTORY_REG(Conv3DBackpropInputD)
+
+REG_OP(LSTMQuant)
+    .INPUT(x, TensorType({DT_FLOAT16,DT_INT8}))
+    .INPUT(cont, TensorType({DT_FLOAT32,DT_FLOAT16}))
+    .OPTIONAL_INPUT(x_static, TensorType({DT_FLOAT16,DT_INT8}))
+    .OPTIONAL_INPUT(h_0, TensorType({DT_FLOAT16,DT_FLOAT32,DT_INT8}))
+    .OPTIONAL_INPUT(c_0, TensorType({DT_FLOAT16,DT_FLOAT32}))
+    .INPUT(w_x, TensorType({DT_FLOAT16,DT_INT8}))
+    .INPUT(bias, TensorType({DT_FLOAT16,DT_FLOAT32,DT_INT16,DT_INT32}))
+    .OPTIONAL_INPUT(w_x_static, TensorType({DT_FLOAT16,DT_INT8}))
+    .INPUT(w_h, TensorType({DT_FLOAT16,DT_INT8}))
+    .OPTIONAL_INPUT(w_xh_deqscale, TensorType({DT_FLOAT16}))
+    .OPTIONAL_INPUT(w_x_static_deqscale, TensorType({DT_FLOAT16}))
+    .OUTPUT(h, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT8}))
+    .OUTPUT(h_t, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT8}))
+    .OUTPUT(c_t, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .ATTR(num_output, Int, 0)
+    .ATTR(expose_hidden, Bool, false)
+    .ATTR(xh_scale, Float,0)
+    .ATTR(sqrt_mode_xh, Bool, false)
+    .ATTR(sqrt_mode_x_static, Bool, false)
+    .ATTR(xh_offset, Int,0)
+    .ATTR(x_static_scale, Float,0.0)
+    .ATTR(x_static_offset, Int,0)
+    .ATTR(w_xh_offset,ListInt,{0})
+    .ATTR(w_x_static_offset,ListInt,{0})
+    .OP_END_FACTORY_REG(LSTMQuant)
 } // namespace ge
 
 #endif  // GE_OP_NN_CALCULATION_OPS_H
diff --git a/third_party/fwkacllib/inc/ops/nn_detect_ops.h b/third_party/fwkacllib/inc/ops/nn_detect_ops.h
index 39dd23b1..04cc3028 100644
--- a/third_party/fwkacllib/inc/ops/nn_detect_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_detect_ops.h
@@ -166,16 +166,151 @@ REG_OP(ROIAlignGrad)
 *output: Outputs the feature sample of each ROI position. The format is 5HD. The axis N is the number of input ROIs. Axes H, W, and C are consistent with the values of "pooled_height", "pooled_width", and "features", respectively.
 */
 REG_OP(ROIAlign)
-    .INPUT(features, TensorType({DT_FLOAT}))
-    .INPUT(rois, TensorType({DT_FLOAT}))
+    .INPUT(features, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OPTIONAL_INPUT(rois_n, TensorType({DT_INT32}))
-    .OUTPUT(output, TensorType({DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
     .REQUIRED_ATTR(spatial_scale, Float)
     .REQUIRED_ATTR(pooled_height, Int)
     .REQUIRED_ATTR(pooled_width, Int)
     .ATTR(sample_num, Int, 2)
+    .ATTR(roi_end_mode, Int, 1)
     .OP_END_FACTORY_REG(ROIAlign)
 
+/**
+*@brief Performs SSD prior box detection.
+
+*@par Inputs:
+* Two inputs, including:
+*@li x: An NC1HWC0 or NCHW feature map of type float32 or float16.
+*@li img: Source image. Has the same type and format as "x".
+
+*@par Attributes:
+*@li min_size: A required list of float32 values, specifying the minimum edge length of a square prior box.
+*@li max_size: A required list of float32 values, specifying the maximum edge length of a square prior box: sqrt(min_size * max_size)
+*@li aspect_ratio: A required list of float32 values, specifying the aspect ratios for generated rectangle boxes. The height is min_size/sqrt(aspect_ratio), the width is min_size*sqrt(aspect_ratio).
+*@li img_h: An optional int32, specifying the source image height. Defaults to "0".
+*@li img_w: An optional int32, specifying the source image width.
Defaults to "0". +*@li step_h: An optional float32, specifying the height step for mapping the center point from the feature map to the source image. Defaults to "0.0". +*@li step_w: An optional float32, specifying the width step for mapping the center point from the feature map to the source image. Defaults to "0.0". +*@li flip: An optional bool. If "True", "aspect_ratio" will be flipped. Defaults to "True". +*@li clip: An optional bool. If "True", a prior box is clipped to within [0, 1]. Defaults to "False". +*@li offset: An optional float32, specifying the offset. Defaults to "0.5". +*@li variance: An optional float32, specifying the variance of a prior box, either one or four variances. Defaults to "0.1" (one value). + +*@par Outputs: +*y: An ND tensor of type float32 or float16, specifying the prior box information, including its coordinates and variance. + +*@attention Constraints:\n +* This operator applies only to SSD networks. +*@see SSDDetectionOutput() +*/ + REG_OP(PriorBox) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .REQUIRED_ATTR(min_size, ListFloat) + .REQUIRED_ATTR(max_size, ListFloat) + .REQUIRED_ATTR(aspect_ratio, ListFloat) + .ATTR(img_h, Int, 0) + .ATTR(img_w, Int, 0) + .ATTR(step_h, Float, 0.0) + .ATTR(step_w, Float, 0.0) + .ATTR(flip, Bool, true) + .ATTR(clip, Bool, false) + .ATTR(offset, Float, 0.5) + .ATTR(variance, ListFloat, {0.1}) + .OP_END_FACTORY_REG(PriorBox); + +/** +*@brief Performs SSD prior box detection, with four additional matrices and the "aspect_ratio" attribute deleted compared to PriorBox. + +*@par Inputs: +* Six inputs, including: +*@li x: An NC1HWC0 or NCHW feature map of type is float32 or float16. +*@li img: source image. Has the same type and format as "x". +*@li data_h: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the matrix for indexing the feature map height. +*@li data_w: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the matrix for indexing the feature map width. +*@li box_height: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the height of each prior box. +*@li box_width: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the width of each prior box. + +*@par Attributes: +*@li min_size: A required float32, specifying the minimum edge length of a square prior box. +*@li max_size: A required float32, specifying the maximum edge length of a square prior box: sqrt(min_size * max_size) +*@li img_h: An optional int32, specifying the height of the source image. +*@li img_w: An optional int32, specifying the width of the source image. +*@li step_h: An optional float32, specifying the height step for mapping the center point from the feature map to the source image. +*@li step_w: An optional float32, specifying the width step for mapping the center point from the feature map to the source image. +*@li flip: An optional bool. If "True", "aspect_ratio" will be flipped. Defaults to "True". +*@li clip: An optional bool. If "True", a prior box is clipped to within [0, 1]. Defaults to "False". +*@li offset: An optional float32, specifying the offset. Defaults to "0.5". +*@li variance: An optional float32, specifying the variance of a prior box, either one or four variances. Defaults to "0.1" (one value). + +*@par Outputs: +*y: An ND tensor of type float32 or float16, specifying the prior box information, including its coordinates and variance. 
+
+*@attention Constraints:\n
+* This operator applies only to SSD networks.
+*@see SSDDetectionOutput()
+*/
+ REG_OP(PriorBoxD)
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(data_h, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(data_w, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(box_height, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(box_width, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .REQUIRED_ATTR(min_size, ListFloat)
+    .REQUIRED_ATTR(max_size, ListFloat)
+    .ATTR(img_h, Int, 0)
+    .ATTR(img_w, Int, 0)
+    .ATTR(step_h, Float, 0.0)
+    .ATTR(step_w, Float, 0.0)
+    .ATTR(flip, Bool, true)
+    .ATTR(clip, Bool, false)
+    .ATTR(offset, Float, 0.5)
+    .ATTR(variance, ListFloat, {0.1})
+    .OP_END_FACTORY_REG(PriorBoxD);
+
+/**
+*@brief Performs Position Sensitive ROI Pooling.
+
+*@par Inputs:
+* Two inputs, including:
+*@li x: An NC1HWC0 tensor of type float16 or float32, describing the feature
+* map, dimension C1 must be equal to
+* (int((output_dim+15)/C0))*group_size*group_size.
+*@li rois: A tensor of type float16 or float32, with shape
+* [batch, 5, rois_num], describing the ROIs, each ROI consists of five
+* elements: "batch_id", "x1", "y1", "x2", and "y2", where "batch_id" indicates
+* the index of the input feature map, "x1", "y1", "x2", or "y2" must be
+* greater than or equal to "0.0".
+
+*@par Attributes:
+*@li output_dim: A required int32, specifying the number of output channels,
+* must be greater than 0.
+*@li group_size: A required int32, specifying the number of groups to encode
+* position-sensitive score maps, must be within the range (0, 128).
+*@li spatial_scale: A required scaling factor for mapping the input
+* coordinates to the ROI coordinates.
+
+*@par Outputs:
+*y: An NC1HWC0 tensor of type float16 or float32, describing the result
+* feature map.
+
+*@attention Constraints:
+* NC1HWC0: the number of channels must be group_size squared, and rois_num must be a multiple of 16
+*/
+REG_OP(PSROIPooling)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .INPUT(rois, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .ATTR(output_dim, Int, 0)
+    .ATTR(group_size, Int, 0)
+    .ATTR(spatial_scale, Float, 0.0625)
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OP_END_FACTORY_REG(PSROIPooling)
+
 } // namespace ge
 #endif  // GE_OP_NN_DETECT_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/nn_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_norm_ops.h
index 8dec2dc3..9b82f565 100644
--- a/third_party/fwkacllib/inc/ops/nn_norm_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_norm_ops.h
@@ -92,12 +92,36 @@ REG_OP(SoftmaxCrossEntropyWithLogits)
     .OUTPUT(backprop, TensorType({DT_DOUBLE,DT_FLOAT16,DT_FLOAT}))
     .OP_END_FACTORY_REG(SoftmaxCrossEntropyWithLogits)
 
+/**
+*@brief Computes gradients for a softmax operation.
+
+*@par Inputs:
+* Two inputs, including: \n
+* @li softmax: Output of the softmax operator. Must be one of the following types: float16, float32, int32, int8, uint8. The format is NC1HWC0 or DN.
+* @li grad_softmax: A Tensor. Has the same shape and type as "softmax". The format is NC1HWC0 or DN.
+
+*@par Outputs:
+*grad_x: A Tensor. Has the same shape and type as "softmax".
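+*
+* For reference, this signature matches the usual softmax backward (a sketch,
+* not quoted from this header): with s = softmax and g = grad_softmax,
+* grad_x = (g - sum(g * s, softmax_axis, keepdims)) * s.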
+
+*/
 REG_OP(SoftmaxGrad)
     .INPUT(softmax, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
     .INPUT(grad_softmax, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
     .OUTPUT(grad_x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
     .OP_END_FACTORY_REG(SoftmaxGrad)
 
+/**
+*@brief Performs the backpropagation of SigmoidCrossEntropyWithLogits for training scenarios.
+
+*@par Inputs:
+* Three inputs, including: \n
+*@li predict: A multi-dimensional Tensor of type float16 or float32, specifying the predictive value.
+*@li target: A multi-dimensional Tensor of type float16 or float32, specifying the target value.
+*@li dout: A multi-dimensional Tensor of float16 or float32, specifying the gradient transferred from the upper layer.
+
+*@par Outputs: \n
+*gradient: The returned gradient. Has the same dimensions and type as "predict".
+
+*/
 REG_OP(SigmoidCrossEntropyWithLogitsGrad)
     .INPUT(predict, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(target, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -105,12 +129,43 @@ REG_OP(SigmoidCrossEntropyWithLogitsGrad)
     .OUTPUT(gradient, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogitsGrad)
 
+/**
+*@brief Computes the sigmoid cross entropy loss of "predict" and "target".
+
+*@par Inputs:
+* Two inputs, including: \n
+*@li predict: A multi-dimensional Tensor of type float16 or float32, specifying the predictive value.
+*@li target: A multi-dimensional Tensor of type float16 or float32, specifying the target value.
+
+*@par Outputs:
+*loss: Sigmoid cross entropy between the predictive value and target value. Has the same dimensions as "predict".
+
+*/
 REG_OP(SigmoidCrossEntropyWithLogits)
     .INPUT(predict, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(target, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(loss, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogits)
 
+/**
+*@brief Computes the regression box of the RPN. It is a FasterRCNN operator.
+
+*@par Inputs:
+* Two inputs, including: \n
+*@li predict: A multi-dimensional Tensor of type float16 or float32, specifying the predictive value.
+*@li label: A multi-dimensional Tensor of type float16 or float32, specifying the target value.
+
+*@par Attributes:
+* sigma: An optional float32. Defaults to "1.0".
+
+*@par Outputs:
+*loss: Indicates the loss between the predictive value and target value. Has the same dimensions as "predict".
+
+*@attention Constraints:
+* This operator does not perform the "reduce" operation on the loss value. Call other reduce operators to perform the "reduce" operation on the loss if required.
+
+*/
 REG_OP(SmoothL1Loss)
     .INPUT(predict, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(label, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -118,6 +173,22 @@ REG_OP(SmoothL1Loss)
     .ATTR(sigma, Float, 1.0)
     .OP_END_FACTORY_REG(SmoothL1Loss)
 
+/**
+*@brief Performs the backpropagation of SmoothL1Loss for training scenarios.
+
+*@par Inputs:
+* Three inputs, including: \n
+*@li predict: A multi-dimensional Tensor of type float16 or float32, specifying the predictive value.
+*@li label: A multi-dimensional Tensor of float16 or float32, specifying the target value.
+*@li dout: A multi-dimensional Tensor of float16 or float32, specifying the gradient transferred from the upper layer.
+
+*@par Attributes:
+* sigma: An optional float32. Defaults to "1.0".
+
+*@par Outputs:
+*gradient: The returned gradient. Has the same dimensions and type as "predict".
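+*
+* For reference, with d = predict - label, the sigma-parameterized smooth L1
+* commonly used here (a sketch, not quoted from this header) is
+* loss(d) = 0.5 * (sigma * d)^2 if |d| < 1 / sigma^2, and
+* loss(d) = |d| - 0.5 / sigma^2 otherwise; "gradient" is d(loss)/d(predict)
+* scaled by "dout".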
+
+*/
 REG_OP(SmoothL1LossGrad)
     .INPUT(predict, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(label, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -126,6 +197,26 @@ REG_OP(SmoothL1LossGrad)
     .ATTR(sigma, Float, 1.0)
     .OP_END_FACTORY_REG(SmoothL1LossGrad)
 
+/**
+*@brief Creates a criterion that measures the Binary Cross Entropy between the target and the output.
+
+*@par Inputs:
+* Three inputs, including: \n
+*@li x: A 1D or 2D Tensor of type float16 or float32, specifying a predictive value.
+*@li y: A 1D or 2D Tensor of type float16 or float32, specifying the label.
+*@li weight: An optional 1D or 2D Tensor, specifying the weight.
+
+*@par Attributes:
+*reduction: A character string from "none", "mean", and "sum", specifying the reduction type to be applied to the output. Defaults to "mean".
+
+*@par Outputs:
+*output: Output loss. Has the same dimensions as the inputs. When "reduction" is set to "none", a Tensor with the same size as "x" is output. Otherwise, a Scalar is output.
+
+*@attention Constraints:
+*@li The value of "x" must range from 0 to 1.
+*@li The value of "y" must be "0" or "1".
+
+*/
 REG_OP(BinaryCrossEntropy)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
     .INPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
@@ -134,6 +225,27 @@ REG_OP(BinaryCrossEntropy)
     .ATTR(reduction, String, "mean")
     .OP_END_FACTORY_REG(BinaryCrossEntropy)
 
+/**
+*@brief Performs the backpropagation of BinaryCrossEntropy for training scenarios.
+
+*@par Inputs:
+* Four inputs, including: \n
+*@li x: A 1D or 2D Tensor of type float16 or float32, specifying a predictive value.
+*@li y: A 1D or 2D Tensor of type float16 or float32, specifying the label.
+*@li grad_output: A 1D or 2D Tensor of type float16 or float32, specifying the backpropagation gradient.
+*@li weight: An optional 1D or 2D Tensor, specifying the weight.
+
+*@par Attributes: \n
+*reduction: A character string from "none", "mean", and "sum", specifying the gradient output mode. Defaults to "mean".
+
+*@par Outputs: \n
+*output: A 1D or 2D Tensor. When "reduction" is set to "none", a Tensor with the same size as "x" is output. Otherwise, a Scalar is output.
+
+*@attention Constraints:
+*@li The value of "x" must range from 0 to 1.
+*@li The value of "y" must be "0" or "1".
+
+*/
 REG_OP(BinaryCrossEntropyGrad)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
     .INPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
@@ -153,16 +265,16 @@ that the elements of the n-dimensional output Tensor lie in the range [0,1] and
 *float32, double. Should be a Variable Tensor.
 
 *@par Attributes:
-*axis: A list of ints. The dimension softmax would be performed on.
+*axes: A list of ints. The dimension softmax would be performed on.
 
 *@par Outputs:
 *y: A Tensor. Has the same dimensionality and shape as the "x" with values in the range [0, 1]. Must be one of the following types: float16, float32, int32.
 */
-REG_OP(Softmax)
+REG_OP(SoftmaxV2)
     .INPUT(x, TensorType({DT_DOUBLE, DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(y, TensorType({DT_DOUBLE, DT_FLOAT16, DT_FLOAT}))
-    .ATTR(axis, ListInt, {-1})
-    .OP_END_FACTORY_REG(Softmax)
+    .ATTR(axes, ListInt, {-1})
+    .OP_END_FACTORY_REG(SoftmaxV2)
 
 /**
 *@brief Computes log softmax activations.
@@ -172,16 +284,16 @@ REG_OP(SoftmaxV2)
 * logits: A Tensor. Must be one of the following types: double, float16, float32.
 
 *@par Attributes:
-* axis: An optional list of ints. Defaults to "{-1}".
+* axes: An optional list of ints. Defaults to "{-1}".
 
 *@par Outputs:
 * logsoftmax: A Tensor. Has the same type as "logits".
*/
-REG_OP(LogSoftmax)
+REG_OP(LogSoftmaxV2)
    .INPUT(logits, TensorType({DT_DOUBLE, DT_FLOAT16, DT_FLOAT}))
    .OUTPUT(logsoftmax, TensorType({DT_DOUBLE, DT_FLOAT16, DT_FLOAT}))
-    .ATTR(axis, ListInt, {-1})
-    .OP_END_FACTORY_REG(LogSoftmax)
+    .ATTR(axes, ListInt, {-1})
+    .OP_END_FACTORY_REG(LogSoftmaxV2)

REG_OP(FusedBatchNormV2)
    .INPUT(x, TensorType{DT_FLOAT}) /* Input data tensor from the previous operator"" */
@@ -198,26 +310,6 @@ REG_OP(FusedBatchNormV2)
    .ATTR(beta, Float, 0)
    .OP_END_FACTORY_REG(FusedBatchNormV2)

-REG_OP(Scale)
-    .INPUT(x, TensorType{DT_FLOAT})
-    .OPTIONAL_INPUT(w, TensorType{DT_FLOAT})
-    .OPTIONAL_INPUT(b, TensorType{DT_FLOAT})
-    .OUTPUT(y, TensorType{DT_FLOAT})
-    .ATTR(bias_term, Bool, false)
-    .ATTR(axis, Int, 1)
-    .ATTR(num_axis, Int, 1)
-    .ATTR(alpha, Float, 1.0)
-    .ATTR(beta, Float, 0.0)
-    .OP_END_FACTORY_REG(Scale)
-
-REG_OP(SoftmaxGradExt)
-    .INPUT(grad, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(x1, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .INPUT(x2, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
-    .ATTR(axis, ListInt, {-1})
-    .ATTR(keep_dims, Bool, false)
-    .OP_END_FACTORY_REG(SoftmaxGradExt)

/**
*@brief Confuse mul, sum and sub.
@@ -237,6 +329,77 @@ REG_OP(ConfusionSoftmaxGrad)
    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
    .OP_END_FACTORY_REG(ConfusionSoftmaxGrad)

+REG_OP(SoftmaxGradExt)
+    .INPUT(grad, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(x1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(x2, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .ATTR(axes, Int, 1)
+    .ATTR(keep_dims, Bool, false)
+    .OP_END_FACTORY_REG(SoftmaxGradExt)
+
+/**
+*@brief Normalizes the input.
+
+*@par Inputs:
+* One input:
+*x: An NCHW tensor of type float16 or float32.
+
+*@par Attributes:
+*@li normalize_variance: An optional bool specifying whether to normalize the variance, either "true" (default) or "false".
+* The value "false" indicates only to subtract the mean.
+*@li across_channels: An optional bool specifying whether to perform across-channel MVN, either "true" or "false" (default).
+* The value "true" indicates that "CHW" is treated as a vector.
+*@li eps: An optional float32 epsilon added to avoid division by zero. Defaults to "1e-9".
+
+*@par Outputs:
+*y: An NCHW tensor of type float16 or float32.
+
+*@attention Constraints:\n
+* The input tensor must have the NCHW format, whose shape length must be 4.
+*/
+
+REG_OP(MVN)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16})) /* "First operand." */
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) /* "Result, has same element type as inputs" */
+    .ATTR(normalize_variance, Bool, true)
+    .ATTR(across_channels, Bool, false)
+    .ATTR(eps, Float, 1e-9)
+    .OP_END_FACTORY_REG(MVN)
+
+/**
+*@brief Normalizes the input "x1".
+
+*@par Inputs:
+* Two inputs, including:
+*@li x1: A required NCHW or NHWC tensor of type float32, float16, or int8.
+*@li x2: A required ND tensor of type float32, float16, or int8, specifying
+* the scaling factor. If "channel_shared" is "true", "x2" is a [1]-dimensional
+* vector. If "channel_shared" is "false", "x2" is a [C]-dimensional vector.
+
+*@par Attributes:
+*@li across_spatial: An optional bool, specifying the dimensions of input "x1"
+* to be summed. The value "true" (default) indicates dimensions C, H, W, and
+* the value "false" indicates dimension C.
+*@li channel_shared: An optional bool, specifying the dimension count of input
+* "x2". The value "true" (default) indicates 1, and the value "false" indicates
+* dimension C of "x1".
+*@li eps: An optional float32, specifying the bias when "across_spatial" is
+* "true". Defaults to "1e-10".
+
+*@par Outputs:
+*y: A Tensor. Has the same type and format as "x1".
+
+*/
+REG_OP(Normalize)
+    .INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8}))
+    .INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8}))
+    .ATTR(across_spatial, Bool, true)
+    .ATTR(channel_shared, Bool, true)
+    .ATTR(eps, Float, 1e-10)
+    .OP_END_FACTORY_REG(Normalize);

/**
*@brief Layernorm operator interface implementation
*  calculating: x, gamma, beta
@@ -421,6 +584,43 @@ REG_OP(DropOutDoMask)
    .INPUT(keep_prob, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OP_END_FACTORY_REG(DropOutDoMask)
+
+/**
+*@brief Scales the input.
+
+*@par Inputs:
+* Three inputs, including:
+*@li x: An ND tensor of type float16 or float32.
+*@li scale: An ND tensor of type float16 or float32.
+*@li bias: An ND tensor of type float16 or float32.
+
+*@par Attributes:
+*@li axis: An optional int32 used to compute the shape of the "scale" and "bias" inputs from the bottom blobs. Defaults to "1".
+*@li num_axes: An optional int32, specifying how many axes of "x" the scale covers. Defaults to "1".
+*@li scale_from_blob: An optional bool, specifying whether "scale" is taken from an input blob. Defaults to "true".
+
+*@par Outputs:
+*y: An ND tensor of type float16 or float32.
+
+*@attention Constraints:\n
+* Assume that the shape length of "x" is "n" and that of "scale" is "m".
+*@li "axis" is within the range [-n, n-1]. num_axes >= -1.
+*@li If "scale_from_blob = true", "num_axes = -1", and "axis >= 0", the ith axis of "scale" and the (i+"axis")th axis of "x" must have the same size (0 <= i < n-axis).\n
+* If "axis < 0", the ith axis of "scale" and the (i+n+"axis")th axis of "x" must have the same size (0 <= i < -axis).
+*@li If "scale_from_blob = true" and "num_axes = 0", "scale" is a scalar with shape length 1 and dimension size 1.
+*@li If "scale_from_blob = true", "num_axes > 0", and "axis >= 0", "axis + num_axes" must be less than or equal to "n" and the ith axis of "scale" and the (i+"axis")th axis of "x" must have the same size (0 <= i < num_axes).\n
+* If "axis < 0", "n + axis + num_axes" must be less than or equal to "n" and the ith axis of "scale" and the (i+n+"axis")th axis of "x" must have the same size (0 <= i < num_axes).
+*@li If "scale_from_blob = false", "scale" is not a scalar, and "axis >= 0", "axis + m" must be less than or equal to "n" and the ith axis of "scale" and the (i+"axis")th axis of "x" must have the same size (0 <= i < m).\n
+* If "axis < 0", "n + axis + m" must be less than or equal to "n" and the ith axis of "scale" and the (i+n+"axis")th axis of "x" must have the same size (0 <= i < m).
+*@li If "bias" is not None, the constraints for "bias" are the same as those for "scale".
+*/
+REG_OP(Scale)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16})) /* "First operand." */
+    .INPUT(scale, TensorType({DT_FLOAT, DT_FLOAT16})) /* "Second operand." */
+    .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT, DT_FLOAT16})) /* "Third operand." */
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) /* "Result, has same element type as x" */
+    .ATTR(axis, Int, 1)
+    .ATTR(num_axes, Int, 1)
+    .ATTR(scale_from_blob, Bool, true)
+    .OP_END_FACTORY_REG(Scale)

/**
*@brief Local Response Normalization.
@@ -430,17 +630,16 @@ REG_OP(DropOutDoMask)
*@li x: A Tensor. Must be 4-D shape, and only support the following types: float16, float32.

*@par Attributes:
-*@li depth_radius: An optional int, specifying the half-width of the
-* normalization window. Defaults to "5".
+*@li depth_radius: An optional int, where depth_radius = (local_size + 1) / 2. Defaults to "5".
*@li bias: An optional float32.
* An offset, usually > 0 to avoid dividing by 0.
* Defaults to "1".
*@li alpha: An optional float32. A scaling factor, usually positive.
* Defaults to "1".
-*@li beta: An optional float32. An exponent. Defaults to "0.5".
-*@li norm_region: An optional string. A mode option. Defaults to "ACROSS_CHANNELS".
+*@li norm_region: An optional string. A mode option: "ACROSS_CHANNELS":0, "WITHIN_CHANNEL":1. Defaults to "ACROSS_CHANNELS".

*@par Outputs:
*y: A Tensor. Has the same data type and shape as "x".
+
*/
REG_OP(LRN)
    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
@@ -474,6 +673,7 @@ REG_OP(LRN)

* @attention Constraints:
* "x" and "y" must have the same shape and type as "grads".
+
*/
REG_OP(LRNGrad)
    .INPUT(grads, TensorType({DT_FLOAT16,DT_FLOAT}))
diff --git a/third_party/fwkacllib/inc/ops/nn_ops.h b/third_party/fwkacllib/inc/ops/nn_ops.h
index b76466e9..3fd6d74b 100644
--- a/third_party/fwkacllib/inc/ops/nn_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_ops.h
@@ -22,6 +22,28 @@
namespace ge {

+/**
+*@brief Computes gradients of the FractionalMaxPool function.
+
+*@par Inputs:
+*Inputs include: \n
+* @li orig_input: A Tensor. Must be one of the following types: float32, float64, int32, int64.
+* @li orig_output: A Tensor. Must have the same type as orig_input.
+* @li out_backprop: A Tensor. Must have the same type as orig_input. \n
+  4-D with shape [batch, height, width, channels].
+* @li row_pooling_sequence: A Tensor of type int64.
+* @li col_pooling_sequence: A Tensor of type int64.
+
+*@par Attributes:
+*overlapping: An optional bool. Defaults to False.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as orig_input.
+
+*@attention Constraints:\n
+*-The implementation for FractionalMaxPoolGrad on Ascend uses AICPU, with poor performance.\n
+
+*/
REG_OP(FractionalMaxPoolGrad)
    .INPUT(orig_input, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
    .INPUT(orig_output, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
@@ -32,6 +54,31 @@ REG_OP(FractionalMaxPoolGrad)
    .ATTR(overlapping, Bool, false)
    .OP_END_FACTORY_REG(FractionalMaxPoolGrad)

+/**
+*@brief Performs fractional average pooling on the input.
+
+*@par Inputs:
+*Inputs include: \n
+*x: A Tensor. Must be one of the following types: float32, float64, int32, int64. \n
+  4-D with shape [batch, height, width, channels].
+
+*@par Attributes:
+*@li pooling_ratio: A list of floats that has length >= 4.
+*@li pseudo_random: An optional bool. Defaults to False.
+*@li overlapping: An optional bool. Defaults to False. When set to True, the values at the boundary of adjacent pooling cells are used by both cells.
+*@li deterministic: An optional bool. Defaults to False.
+*@li seed: An optional int. Defaults to 0.
+*@li seed2: An optional int. Defaults to 0.
+
+*@par Outputs:
+*@li y: A Tensor. Has the same type as x.
+*@li row_pooling_sequence: A Tensor of type int64.
+*@li col_pooling_sequence: A Tensor of type int64.
+
+*@attention Constraints:\n
+*-The implementation for FractionalAvgPool on Ascend uses AICPU, with poor performance.\n
+
+*/
REG_OP(FractionalAvgPool)
    .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
    .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
@@ -45,6 +92,31 @@ REG_OP(FractionalAvgPool)
    .ATTR(seed2, Int, 0)
    .OP_END_FACTORY_REG(FractionalAvgPool)

+/**
+*@brief Performs fractional max pooling on the input.
+
+*@par Inputs:
+*Inputs include: \n
+*x: A Tensor. Must be one of the following types: float32, float64, int32, int64. \n
+  4-D with shape [batch, height, width, channels].
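+
+* For example, pooling a [1, 32, 32, 3] NHWC input with the attribute
+* pooling_ratio = [1.0, 1.44, 1.73, 1.0] (see below) yields an output of
+* roughly [1, 22, 18, 3]; the first and last ratios must be 1.0 because the
+* batch and channel dimensions are never pooled.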
+
+*@par Attributes:
+*@li pooling_ratio: A list of floats that has length >= 4. Pooling ratio for each dimension of value.
+*@li pseudo_random: An optional bool. Defaults to False.
+*@li overlapping: An optional bool. Defaults to False.
+*@li deterministic: An optional bool. Defaults to False.
+*@li seed: An optional int. Defaults to 0.
+*@li seed2: An optional int. Defaults to 0.
+
+*@par Outputs:
+*@li y: A Tensor. Has the same type as x.
+*@li row_pooling_sequence: A Tensor of type int64.
+*@li col_pooling_sequence: A Tensor of type int64.
+
+*@attention Constraints:\n
+*-The implementation for FractionalMaxPool on Ascend uses AICPU, with poor performance.\n
+
+*/
REG_OP(FractionalMaxPool)
    .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
    .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
@@ -58,6 +130,25 @@ REG_OP(FractionalMaxPool)
    .ATTR(seed2, Int, 0)
    .OP_END_FACTORY_REG(FractionalMaxPool)

+/**
+*@brief Finds values of the n-th order statistic for the last dimension.
+
+*@par Inputs:
+*Inputs include: \n
+* @li x: A Tensor. Must be one of the following types: float32, float64, int32, uint8, \n
+  int16, int8, int64, bfloat16, uint16, half, uint32, uint64.
+* @li n: A Tensor of type int32. 0-D.
+
+*@par Attributes:
+*reverse: An optional bool. Defaults to False.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as x.
+
+*@attention Constraints:\n
+*-The implementation for NthElement on Ascend uses AICPU, with poor performance.\n
+
+*/
REG_OP(NthElement)
    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16,
                          DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE}))
@@ -67,6 +158,27 @@ REG_OP(NthElement)
    .ATTR(reverse, Bool, false)
    .OP_END_FACTORY_REG(NthElement)

+/**
+*@brief Computes gradient of the FractionalAvgPool function.
+
+*@par Inputs:
+*Inputs include: \n
+* @li orig_input_tensor_shape: A Tensor of type int64.
+* @li out_backprop: A Tensor. Must be one of the following types: float32, float64, \n
+  int32, int64. 4-D with shape [batch, height, width, channels].
+* @li row_pooling_sequence: A Tensor of type int64.
+* @li col_pooling_sequence: A Tensor of type int64.
+
+*@par Attributes:
+*overlapping: An optional bool. Defaults to False.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as out_backprop.
+
+*@attention Constraints:\n
+*-The implementation for FractionalAvgPoolGrad on Ascend uses AICPU, with poor performance.\n
+
+*/
REG_OP(FractionalAvgPoolGrad)
    .INPUT(orig_input_tensor_shape, TensorType({DT_INT64}))
    .INPUT(out_backprop, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
@@ -76,6 +188,25 @@ REG_OP(FractionalAvgPoolGrad)
    .ATTR(overlapping, Bool, false)
    .OP_END_FACTORY_REG(FractionalAvgPoolGrad)

+/**
+*@brief Returns the permuted vector/tensor in the destination data format, given the one in the source data format.
+
+*@par Inputs:
+*Inputs include: \n
+*x: A Tensor. Must be one of the following types: int32, int64. Vector of size 4 \n
+  or Tensor of shape (4, 2) in source data format.
+
+*@par Attributes:
+*@li src_format: An optional string. Defaults to "NHWC". Source data format.
+*@li dst_format: An optional string. Defaults to "NCHW". Destination data format.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as x.
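+
+* For example, with src_format = "NHWC" and dst_format = "NCHW", the size-4
+* vector [n, h, w, c] is returned as [n, c, h, w].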
+
+*@attention Constraints:\n
+*-The implementation for DataFormatVecPermute on Ascend uses AICPU, with poor performance.\n
+
+*/
REG_OP(DataFormatVecPermute)
    .INPUT(x, TensorType({ DT_INT32, DT_INT64 }))
    .OUTPUT(y, TensorType({ DT_INT32, DT_INT64 }))
diff --git a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
index e1fb8558..10f3f369 100644
--- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
@@ -18,6 +18,9 @@
#define GE_OP_NN_POOLING_OPS_H

#include "../graph/operator_reg.h"
+
+namespace ge {
+
/**
*@brief Performs pooling on the input.
*@par Inputs:
@@ -36,13 +39,18 @@
*pad[1]: An optional int32, specifying the bottom padding. Defaults to "0". \n
*pad[2]: An optional int32, specifying the left padding. Defaults to "0". \n
*pad[3]: An optional int32, specifying the right padding. Defaults to "0". \n
+*@li dilation: Optional, including: \n
+*dilation[0]: An optional int32, specifying the up dilation. Defaults to "1". \n
+*dilation[1]: An optional int32, specifying the bottom dilation. Defaults to "1". \n
+*dilation[2]: An optional int32, specifying the left dilation. Defaults to "1". \n
+*dilation[3]: An optional int32, specifying the right dilation. Defaults to "1". \n
*@li ceil_mode: An optional int32, either "0" (ceil mode) or "1" (floor mode). Defaults to "0".

*@par Outputs:
*y: An NCHW tensor of type float16.

*@attention Constraints:\n
*@li window[0] * window[1] < 256;
+*@li 1 <= input_h <= 4096, 1 <= input_w <= 4096
*/
-namespace ge {
REG_OP(Pooling)
    .INPUT(x, TensorType({DT_FLOAT16}))
    .OUTPUT(y, TensorType({DT_FLOAT16}))
@@ -51,6 +59,7 @@ REG_OP(Pooling)
    .ATTR(window, ListInt, {1,1})   // kernel size
    .ATTR(stride, ListInt, {1,1})   // stride size
    .ATTR(pad, ListInt, {0,0,0,0})  // pad size
+    .ATTR(dilation, ListInt, {1,1,1,1})
    .ATTR(ceil_mode, Int, 0)
    .OP_END_FACTORY_REG(Pooling)

@@ -84,6 +93,29 @@ REG_OP(AvgPool)
    .ATTR(data_format, String, "NHWC")
    .OP_END_FACTORY_REG(AvgPool)

+/**
+*@brief Performs max_pool_ext2 on the input.
+
+*@par Inputs:
+* One input:
+*x: An NC1HWC0 Tensor of type float16.
+
+
+*@par Attributes:
+*@li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for each dimension of the input tensor. No default value.
+*@li strides: A required list of int8, int16, int32, or int64 values, specifying the stride of the sliding window for each dimension of the input tensor. No default value.
+*@li padding: A required string. No default value.
+*@li data_format: An optional string. Defaults to "NC1HWC0".
+
+*@par Outputs:
+*y: A Tensor. Has the same type and format as input "x".
+
+*@attention Constraints:
+*@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
+*@li "strides" is a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1.
+*@li "padding" is either "SAME" or "VALID".
+
+*/
REG_OP(MaxPoolExt2)
    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE, DT_INT8,
                          DT_INT16, DT_INT32, DT_INT64, DT_UINT8,
@@ -97,6 +129,30 @@ REG_OP(MaxPoolExt2)
    .ATTR(data_format, String, "NHWC")
    .OP_END_FACTORY_REG(MaxPoolExt2)

+/**
+*@brief Performs max pooling on the input.
+
+*@par Inputs:
+* One input:
+*x: An NC1HWC0 Tensor of type float16.
+
+
+*@par Attributes:
+*@li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for each dimension of the input tensor. No default value.
+*@li strides: A required list of int8, int16, int32, or int64 values, specifying the stride of the sliding window for each dimension of the input tensor. No default value.
+*@li padding: A required string. No default value.
+*@li data_format: An optional string. Defaults to "NC1HWC0".
+
+*@par Outputs:
+*y: A Tensor. Has the same type and format as input "x".
+
+*@attention Constraints:
+*@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
+*@li "strides" is a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1.
+*@li "padding" is either "SAME" or "VALID".
+
+
+*/
REG_OP(MaxPool)
    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE, DT_INT8,
                          DT_INT16, DT_INT32, DT_INT64, DT_UINT8,
@@ -109,6 +165,15 @@ REG_OP(MaxPool)
    .ATTR(data_format, String, "NHWC")
    .OP_END_FACTORY_REG(MaxPool)

+REG_OP(MaxPool3D)
+    .INPUT(x, TensorType({DT_FLOAT16}))
+    .OUTPUT(y, TensorType({DT_FLOAT16}))
+    .REQUIRED_ATTR(ksize, ListInt)
+    .REQUIRED_ATTR(strides, ListInt)
+    .REQUIRED_ATTR(padding, String)
+    .ATTR(data_format, String, "NDHWC")
+    .OP_END_FACTORY_REG(MaxPool3D)
+
/**
* @brief Computes gradients of the maxpooling function.

@@ -122,7 +187,10 @@ REG_OP(MaxPool)
* each dimension of the input tensor.
* @li strides: A required tuple or list, specifying the stride of the sliding
* window for each dimension of the input tensor.
-* @li padding: A required string, specifying the type of padding algorithm to use.
+* @li padding: A required string, specifying the type of padding algorithm
+* to use.
+* @li data_format: An optional string, specifying the data format of the input
+* and output data. Defaults to "NHWC".

* @par Outputs:
* y: A mutable tensor. Has the same shape and type as "x1".
@@ -140,6 +208,7 @@ REG_OP(MaxPoolGrad)
    .REQUIRED_ATTR(ksize, ListInt)
    .REQUIRED_ATTR(strides, ListInt)
    .REQUIRED_ATTR(padding, String)
+    .ATTR(data_format, String, "NHWC")
    .OP_END_FACTORY_REG(MaxPoolGrad)

/**
@@ -151,13 +220,13 @@ REG_OP(MaxPoolGrad)
* @li grad: Gradient tensor of type float16

* @par Attributes:
-* @li ksize: A required list or tuple, specifying the size of the sliding window.
+* @li ksize: A required list or tuple,
+* specifying the size of the sliding window.
* @li strides: A required list or tuple,
* specifying the stride of the sliding window.
* @li padding: A required string, window sliding mode. Either SAME or VALID.
-* @li data_format: An optional string. Format of the original input,
-* either NCHW or NHWC. Defaults
-* to NHWC.
+* @li data_format: An optional string.
+* Format of the original input, either NCHW or NHWC. Defaults to NHWC.

* @attention Constraints:
* @li Only the Ascend 910 platform is supported.
@@ -210,6 +279,28 @@ REG_OP(MaxPoolV2)
    .ATTR(data_format, String, "NHWC")
    .OP_END_FACTORY_REG(MaxPoolV2)

+/**
+*@brief Performs max pooling on the input and outputs both max values and indices.
+
+*@par Inputs:
+* One input:
+*x: An NC1HWC0 Tensor of type float16.
+
+
+*@par Attributes:
+*@li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for each dimension of the input tensor. No default value.
+*@li strides: A required list of int8, int16, int32, or int64 values, specifying the stride of the sliding window for each dimension of the input tensor. No default value.
+*@li padding: A required string. No default value.
+
+*@par Outputs:
+*y: A Tensor. Has the same type and format as input "x".
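+
+* For example, for NHWC input, ksize = {1, 3, 3, 1} with strides = {1, 2, 2, 1}
+* pools 3 x 3 windows over H and W with stride 2, leaving the batch and channel
+* dimensions untouched.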
+
+*@attention Constraints:
+*@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
+*@li "strides" is a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1.
+*@li "padding" is either "SAME" or "VALID".
+
+*/
REG_OP(MaxPoolWithArgmax)
    .INPUT(x, TensorType::RealNumberType())
    .OUTPUT(y, TensorType::RealNumberType())
@@ -220,6 +311,31 @@ REG_OP(MaxPoolWithArgmax)
    .ATTR(Targmax, Int, 7)
    .OP_END_FACTORY_REG(MaxPoolWithArgmax)

+/**
+*@brief Performs the backpropagation of MaxPoolWithArgmax.
+
+*@par Inputs:
+* Three inputs, including:
+*@li x: An NC1HWC0 tensor of type float16.
+*@li grad: An NC1HWC0 tensor of type float16.
+*@li argmax: An NC1HWC0 tensor of type uint16 or int64.
+
+*@par Attributes:
+*@li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for each dimension of the input tensor. No default value.
+*@li strides: A required list of int8, int16, int32, or int64 values, specifying the stride of the sliding window for each dimension of the input tensor. No default value.
+*@li padding: A required string. No default value.
+
+*@par Outputs:
+*y: A Tensor. Has the same type and format as input "x".
+
+*@attention Constraints:
+*@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
+*@li "strides" is a list that has length 4: strides[0] = 1 or strides[3] = 1
+*@li "padding" is either "SAME" or "VALID".
+
+
+*@see max_pool_with_argmax
+*/
REG_OP(MaxPoolGradWithArgmax)
    .INPUT(x, TensorType::RealNumberType())
    .INPUT(grad, TensorType::RealNumberType())
@@ -275,8 +391,8 @@ REG_OP(MaxPoolGradGradWithArgmax)
* each dimension of the input tensor.
* @li strides: A required tuple or list, specifying the stride of the sliding
* window for each dimension of the input tensor.
-* @li padding: A required string, specifying the type of the padding algorithm
-* to use.
+* @li padding: A required string, specifying the type of
+* the padding algorithm to use.
* @li data_format: An optional string. Defaults to "NHWC".

* @par Outputs:
@@ -299,11 +415,11 @@ REG_OP(AvgPoolGrad)
* @input_grad: An NHWC tensor of type float16, float32, or double.

* @par Attributes:
-* @li orig_input_shape: Original input dimensions.
-* @li ksize: A required tuple or list, specifying the size of the window for
-* each dimension of the input tensor.
-* @li strides: A required tuple or list, specifying the stride of the sliding
-* window for each dimension of the input tensor.
+* @li orig_input_shape: A required list, specifying the original input dimensions.
+* @li ksize: A required tuple or list, specifying the size of the window
+* for each dimension of the input tensor.
+* @li strides: A required tuple or list, specifying the stride of
+* the sliding window for each dimension of the input tensor.
* @li padding: A required string, specifying the type of the padding algorithm
* to use.
* @li data_format: An optional string. Defaults to "NHWC".
@@ -351,7 +467,54 @@ REG_OP(MaxPoolGradWithArgmaxCCE)
    .ATTR(data_mode, Int, 1)
    .ATTR(nan_opt, Int, 0)
    .OP_END_FACTORY_REG(MaxPoolGradWithArgmaxCCE)
+/**
+*@brief Upsamples the input "x".
+
+*@par Inputs:
+* One input, including:
+*@li x: A tensor of type float16 or float32.
+
+*@par Attributes:
+*@li scale: An optional float, the scale factor applied to "x". Defaults to "1".
+*@li stride_h: An optional int, the replication factor along the H axis. Defaults to "2".
+*@li stride_w: An optional int, the replication factor along the W axis. Defaults to "2".
+
+*@par Outputs:
+*y: A tensor of type float16 or float32.
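+
+* For example, with stride_h = 2 and stride_w = 2, an N x C x H x W input
+* yields an N x C x 2H x 2W output in which each input value is repeated
+* across the corresponding 2 x 2 block and scaled by "scale" (an illustrative
+* reading of the replication behavior described above).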
+*/
+REG_OP(Upsample)
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .ATTR(scale, Float, 1)
+    .ATTR(stride_h, Int, 2)
+    .ATTR(stride_w, Int, 2)
+    .OP_END_FACTORY_REG(Upsample)
+
+/**
+*@brief Spatial Pyramid Pooling, multi-level pooling.
+* Pools the input to an out(n, sigma(c*2^i*2^i)) tensor, for i in range [0, pyramid_height).
+
+*@par Inputs:
+*x: An NCHW tensor, supporting float16 or float32 type.
+
+*@par Attributes:
+* @li pyramid_height: A required int32.
+* Multi-level pooling out from 2^0 to 2^(pyramid_height-1).
+* @li pool_method: An optional int32, pooling method: 0-MAX, 1-AVE.
+* Defaults to "0".
+
+*@par Outputs:
+*y: An NCHW tensor, supporting float16 or float32 type.
+
+*@attention Constraints:
+* @li pyramid_height: "pyramid_height" should be in range [0, 7).
+* @li feature_size: the input feature map height and width should be within [1, 510].
+
+*/
+REG_OP(SPP)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .REQUIRED_ATTR(pyramid_height, Int)
+    .ATTR(pool_method, Int, 0)
+    .OP_END_FACTORY_REG(SPP)
}  // namespace ge

#endif  // GE_OP_NN_POOLING_OPS_H
diff --git a/third_party/fwkacllib/inc/ops/nn_training_ops.h b/third_party/fwkacllib/inc/ops/nn_training_ops.h
index 97ca6dc0..b8f4003e 100644
--- a/third_party/fwkacllib/inc/ops/nn_training_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_training_ops.h
@@ -17,7 +17,6 @@
#ifndef GE_OP_TRAINING_OPS_H
#define GE_OP_TRAINING_OPS_H

-#include "../../../inc/external/graph/operator_reg.h"
#include "../graph/operator_reg.h"
namespace ge {
/**
@@ -69,47 +68,55 @@ REG_OP(ApplyAdaMax)
    .OP_END_FACTORY_REG(ApplyAdaMax)

/**
-*@brief Updates "var" according to the momentum scheme. Set use_nesterov = True if you
-* want to use Nesterov momentum.\n
-* computing process: \n
-*  accum = accum * momentum + grad\n
-*  var -= lr * accum
+*@brief Updates "var" according to the AdaMax algorithm.\n
+* "t-1" denotes the previous step.
+*  m_t <- beta1 * m{t-1} + (1 - beta1) * grad\n
+*  v_t <- max(beta2 * v{t-1}, abs(grad))\n
+*  var <- var - lr / (1 - beta1^t) * m_t / (v_t + epsilon)
*
*@attention Constraints:\n
*  the input tensors must have the same shape.
*
*@par Inputs:
-*@li var: A mutable tensor. Should be from a Variable().
-*@li accum: A mutable tensor. Has the same type as "var".
+*@li var: A mutable tensor. Must be one of the following types: TensorType::NumberType().
*     Should be from a Variable().
-*@li lr: A scalar. Has the same type as "var".
+*@li m: A mutable tensor. Has the same type as "var".
+*     Should be from a Variable().
+*@li v: A mutable tensor. Has the same type as "var".
+*     Should be from a Variable().
+*@li beta1_power: A scalar. Has the same type as "var".
+*@li lr: learning_rate. A scalar. Has the same type as "var".
+*@li beta1: A scalar. Has the same type as "var".
+*@li beta2: A scalar. Has the same type as "var".
+*@li epsilon: A scalar. Has the same type as "var".
*@li grad: A tensor for the gradient. Has the same type as "var".
*
-*@par Attributes:
-*@li use_nesterov: An optional bool. Defaults to "False".
-*     If "True", the tensor passed to compute grad will be
-*     var - lr * momentum * accum, so in the end, the var you get is actually
-*     var - lr * momentum * accum.
-*
-*@li use_locking: An optional bool. Defaults to "False".\n
-*     If "True", updating of the "var", "ms", and "mom" tensors is protected by a lock;
-*     otherwise the behavior is undefined, but may exhibit less contention.
+*@par Attributes:\n
+* use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var", "ms", and "mom" tensors is protected +* by a lock; otherwise the behavior is undefined, but may exhibit less +* contention. * *@par Outputs: * var: A mutable tensor. Has the same type as input "var". * +* */ - -REG_OP(ApplyMomentum) +REG_OP(ApplyAdaMaxD) .INPUT(var, TensorType::NumberType()) - .INPUT(accum, TensorType::NumberType()) + .INPUT(m, TensorType::NumberType()) + .INPUT(v, TensorType::NumberType()) + .INPUT(beta1_power, TensorType::NumberType()) .INPUT(lr, TensorType::NumberType()) + .INPUT(beta1, TensorType::NumberType()) + .INPUT(beta2, TensorType::NumberType()) + .INPUT(epsilon, TensorType::NumberType()) .INPUT(grad, TensorType::NumberType()) - .INPUT(momentum, TensorType::NumberType()) .OUTPUT(var, TensorType::NumberType()) - .ATTR(use_nesterov, Bool, false) + .OUTPUT(m, TensorType::NumberType()) + .OUTPUT(v, TensorType::NumberType()) .ATTR(use_locking, Bool, false) - .OP_END_FACTORY_REG(ApplyMomentum) + .OP_END_FACTORY_REG(ApplyAdaMaxD) /** *@brief Updates relevant entries in "var" and "accum" according to the adagrad scheme. @@ -128,6 +135,7 @@ REG_OP(ApplyMomentum) *@par Outputs: *var: A Tensor. Has the same type and format as input "var". + */ REG_OP(SparseApplyAdagrad) .INPUT(var, TensorType({DT_FLOAT})) @@ -137,6 +145,7 @@ REG_OP(SparseApplyAdagrad) .INPUT(indices, TensorType({DT_INT32})) .OUTPUT(var, TensorType({DT_FLOAT})) .ATTR(use_locking, Bool, false) + .ATTR(update_slots, Bool, true) .OP_END_FACTORY_REG(SparseApplyAdagrad) /** @@ -156,6 +165,7 @@ REG_OP(SparseApplyAdagrad) *@par Outputs: *var: A Tensor. Has the same type and format as input "var". + */ REG_OP(SparseApplyAdagradD) .INPUT(var, TensorType({DT_FLOAT})) @@ -163,10 +173,120 @@ REG_OP(SparseApplyAdagradD) .INPUT(grad, TensorType({DT_FLOAT})) .INPUT(indices, TensorType({DT_INT32})) .OUTPUT(var, TensorType({DT_FLOAT})) + .OUTPUT(accum, TensorType({DT_FLOAT})) .REQUIRED_ATTR(lr, Float) .ATTR(use_locking, Bool, false) + .ATTR(update_slots, Bool, true) .OP_END_FACTORY_REG(SparseApplyAdagradD) +/** +*@brief Updates relevant entries in "var" and "accum" according to the adagrad scheme. + +*@par Inputs: +* Five inputs, including: +*@li var: An NCHW, NHWC, or ND Tensor of type float32. +*@li accum: An NCHW, NHWC, or ND Tensor of type float32. +*@li lr: An NCHW, NHWC, or ND Tensor of type float32. +*@li epsilon: An NCHW, NHWC, or ND Tensor of type float32. +*@li grad: An NCHW, NHWC, or ND Tensor of type float32. +*@li indices: An NCHW, NHWC, or ND Tensor of type float32. + +*@par Attributes: +*@li use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock. +*@li update_slots: An optional bool. Defaults to "True". If "True", the calcution will be different as "False". + +*@par Outputs: +*var: A Tensor. Has the same type and format as input "var". + +*/ +REG_OP(SparseApplyAdagradV2) + .INPUT(var, TensorType({DT_FLOAT})) + .INPUT(accum, TensorType({DT_FLOAT})) + .INPUT(lr, TensorType({DT_FLOAT})) + .INPUT(epsilon, TensorType({DT_FLOAT})) + .INPUT(grad, TensorType({DT_FLOAT})) + .INPUT(indices, TensorType({DT_INT32})) + .OUTPUT(var, TensorType({DT_FLOAT})) + .ATTR(use_locking, Bool, false) + .ATTR(update_slots, Bool, true) + .OP_END_FACTORY_REG(SparseApplyAdagradV2) + +/** +*@brief Updates relevant entries in "var" and "accum" according to the adagrad scheme. + +*@par Inputs: +* Four inputs, including: +*@li var: An NCHW, NHWC, or ND Tensor of type float32. +*@li accum: An NCHW, NHWC, or ND Tensor of type float32. 
+*@li grad: An NCHW, NHWC, or ND Tensor of type float32.
+*@li indices: An NCHW, NHWC, or ND Tensor of type int32.
+
+*@par Attributes:
+*@li lr: A required float, used for the computation.
+*@li epsilon: A required float, used for the computation.
+*@li use_locking: An optional bool. Defaults to "False". If "True", the operation will be protected by a lock.
+*@li update_slots: An optional bool. Defaults to "True". If "True", "accum" is updated; if "False", it is not.
+
+*@par Outputs:
+*var: A Tensor. Has the same type and format as input "var".
+*accum: A Tensor. Has the same type and format as input "accum".
+
+*/
+REG_OP(SparseApplyAdagradV2D)
+    .INPUT(var, TensorType({DT_FLOAT}))
+    .INPUT(accum, TensorType({DT_FLOAT}))
+    .INPUT(grad, TensorType({DT_FLOAT}))
+    .INPUT(indices, TensorType({DT_INT32}))
+    .OUTPUT(var, TensorType({DT_FLOAT}))
+    .OUTPUT(accum, TensorType({DT_FLOAT}))
+    .REQUIRED_ATTR(lr, Float)
+    .REQUIRED_ATTR(epsilon, Float)
+    .ATTR(use_locking, Bool, false)
+    .ATTR(update_slots, Bool, true)
+    .OP_END_FACTORY_REG(SparseApplyAdagradV2D)
+
+/**
+*@brief Updates "var" according to the momentum scheme. Set use_nesterov = True if you
+* want to use Nesterov momentum.\n
+* computing process: \n
+*  accum = accum * momentum + grad\n
+*  var -= lr * accum
+*
+*@attention Constraints:\n
+*  the input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable tensor. Should be from a Variable().
+*@li accum: A mutable tensor. Has the same type as "var".
+*     Should be from a Variable().
+*@li lr: A scalar. Has the same type as "var".
+*@li grad: A tensor for the gradient. Has the same type as "var".
+*
+*@par Attributes:
+*@li use_nesterov: An optional bool. Defaults to "False".
+*     If "True", the tensor passed to compute grad will be
+*     var - lr * momentum * accum, so in the end, the var you get is actually
+*     var - lr * momentum * accum.
+*
+*@li use_locking: An optional bool. Defaults to "False".\n
+*     If "True", updating of the "var" and "accum" tensors is protected by a lock;
+*     otherwise the behavior is undefined, but may exhibit less contention.
+*
+*@par Outputs:
+* var: A mutable tensor. Has the same type as input "var".
+*
+*/
+
+REG_OP(ApplyMomentum)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(accum, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(momentum, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .ATTR(use_nesterov, Bool, false)
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyMomentum)

REG_OP(ApplyMomentumCCE)
    .INPUT(var, TensorType::NumberType())
@@ -415,6 +535,130 @@ REG_OP(ApplyAdagrad)
    .ATTR(use_locking, Bool, false)
    .OP_END_FACTORY_REG(ApplyAdagrad)

+/**
+*@brief Updates "var" according to the adagrad scheme.\n
+*  accum += grad * grad\n
+*  var -= lr * grad * (1 / sqrt(accum))
+*
+*@attention Constraints:\n
+*  the input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable tensor. Should be from a Variable().
+*@li accum: A mutable tensor. Has the same type as "var".
+*     Should be from a Variable().
+*@li lr: A scalar. Has the same type as "var".
+*@li grad: A tensor for the gradient. Has the same type as "var".
+*
+*@par Attributes:
+* use_locking: An optional bool. Defaults to "False".
+*     If "True", updating of the "var" and "accum" tensors is protected
+*     by a lock; otherwise the behavior is undefined, but may exhibit less
+*     contention.
+*
+*@par Outputs:
+*@li var: A mutable tensor. Has the same type as input "var".
+*@li accum: A mutable tensor. Has the same type as input "var". +* +* +*/ +REG_OP(ApplyAdagradD) + .INPUT(var, TensorType::NumberType()) + .INPUT(accum, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(grad, TensorType::NumberType()) + .OUTPUT(var, TensorType::NumberType()) + .OUTPUT(accum, TensorType::NumberType()) + .ATTR(update_slots, Bool, true) + .ATTR(use_locking, Bool, false) + .OP_END_FACTORY_REG(ApplyAdagradD) + +/** +* @brief Updates "var" according to the adagradv2 scheme.\n +* accum += grad * grad \n +* var -= lr * grad * (1 / sqrt(accum) + epsilon) +* +* @attention Constraints: +* the input tensors must have the same shape. +* +* @par Inputs: +* @li var: A mutable tensor. Must be one of the data types defined in +* TensorType::NumberType(). Should be from a Variable(). +* @li accum: A mutable tensor. Has the same type as "var". Should be from a +* Variable(). +* @li lr: A tensor for the learning rate. Has the same type as "var". Should be +* from a Variable(). +* @li grad: A tensor for the gradient. Has the same type as "var". Should be +* from a Variable(). +* @li epsilon: A scalar. Has the same type as "var". +* +* @par Attributes: +* @li update_slots: An optional bool. Defaults to "True". +* If "True", accum will be updated +* @li use_locking: An optional bool. Defaults to "False". +* If "True", updating of the "var" tensor is protected by a lock; +* otherwise the behavior is undefined, but may exhibit less contention. +* +* @par Outputs: +* var: A mutable tensor. Has the same type as input "var". +* +* +*/ +REG_OP(ApplyAdagradV2) + .INPUT(var, TensorType::NumberType()) + .INPUT(accum, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(epsilon, TensorType::NumberType()) + .INPUT(grad, TensorType::NumberType()) + .OUTPUT(var, TensorType::NumberType()) + .ATTR(update_slots, Bool, true) + .ATTR(use_locking, Bool, false) + .OP_END_FACTORY_REG(ApplyAdagradV2) + + +/** +* @brief Updates "var" according to the adagradv2 scheme.\n +* accum += grad * grad \n +* var -= lr * grad * (1 / sqrt(accum) + epsilon) +* +* @attention Constraints: +* the input tensors must have the same shape. +* +* @par Inputs: +* @li var: A mutable tensor. Must be one of the data types defined in +* TensorType::NumberType(). Should be from a Variable(). +* @li accum: A mutable tensor. Has the same type as "var". Should be from a +* Variable(). +* @li lr: A tensor for the learning rate. Has the same type as "var". Should be +* from a Variable(). +* @li grad: A tensor for the gradient. Has the same type as "var". Should be +* from a Variable(). +* +* @par Attributes: +* @li epsilon: A scalar. Has the same type as "var". +* @li update_slots: An optional bool. Defaults to "True". +* If "True", accum will be updated +* @li use_locking: An optional bool. Defaults to "False". +* If "True", updating of the "var" tensor is protected by a lock; +* otherwise the behavior is undefined, but may exhibit less contention. +* +* @par Outputs: +* var: A mutable tensor. Has the same type as input "var". 
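+*
+* A scalar sketch of the documented update, mirroring the formula above
+* (illustrative only, not the Ascend kernel):
+*@code
+* accum += grad * grad;
+* var -= lr * grad * (1.0f / std::sqrt(accum) + epsilon);
+*@endcode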
+*
+*
+*/
+REG_OP(ApplyAdagradV2D)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(accum, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(accum, TensorType::NumberType())
+    .REQUIRED_ATTR(epsilon, Float)
+    .ATTR(update_slots, Bool, true)
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyAdagradV2D)
+
/**
*@brief Updates "var" according to the proximal adagrad scheme.

@@ -458,6 +702,54 @@ REG_OP(ApplyAdagradDA)
    .ATTR(use_locking, Bool, false)
    .OP_END_FACTORY_REG(ApplyAdagradDA)

+/**
+*@brief Updates "var" according to the proximal adagrad scheme.
+
+*@par Inputs:
+*Eight inputs, including:
+* @li var: A mutable Tensor. Must be one of the following types:
+* TensorType::NumberType(). Should be a Variable Tensor.
+* @li gradient_accumulator: A mutable Tensor. Must have the same
+* type as "var". Should be a Variable Tensor.
+* @li gradient_squared_accumulator: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li grad: A Tensor of the same type as "var", for the gradient.
+* @li lr: A Tensor of the same type as "var".
+* Scaling factor. Must be a scalar.
+* @li l1: A Tensor of the same type as "var".
+* L1 regularization. Must be a scalar.
+* @li l2: A Tensor of the same type as "var".
+* L2 regularization. Must be a scalar.
+* @li global_step: A Tensor of type int32 or int64.
+* Training step number. Must be a scalar.
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the var and accum tensors will be
+* protected by a lock; otherwise the behavior is undefined,
+* but may exhibit less contention.
+
+*@par Outputs:
+*var: A mutable Tensor. Has the same type as "var".
+*gradient_accumulator: A mutable Tensor. Has the same type as "var".
+*gradient_squared_accumulator: A mutable Tensor. Has the same type as "var".
+
+*/
+REG_OP(ApplyAdagradDAD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(gradient_accumulator, TensorType::NumberType())
+    .INPUT(gradient_squared_accumulator, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(l1, TensorType::NumberType())
+    .INPUT(l2, TensorType::NumberType())
+    .INPUT(global_step, TensorType({DT_INT32, DT_INT64}))
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(gradient_accumulator, TensorType::NumberType())
+    .OUTPUT(gradient_squared_accumulator, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyAdagradDAD)
+
/**
*@brief Returns the dimension index in the destination data format given the one in
* the source data format.
@@ -560,9 +852,9 @@ REG_OP(SGD)
* var: A mutable tensor. Has the same type as input "var".
*
* @attention Constraints:
-* @li Note that in dense implementation of this algorithm, "ms" and "mom" will\n
-* update even if "grad" is 0, but in this sparse implementation, "ms" and "mom"\n
-* will not update in iterations during which "grad" is 0.\n
+* @li Note that in dense implementation of this algorithm, "ms" and "mom" will \n
+* update even if "grad" is 0, but in this sparse implementation, "ms" and "mom" \n
+* will not update in iterations during which "grad" is 0.
* @li The input tensors "var", "ms", "mom" and "grad" must have the same shape.
*/
REG_OP(ApplyRMSProp)
@@ -599,14 +891,15 @@ REG_OP(ApplyRMSProp)
*
* @par Attributes:
* @li use_locking: An optional "bool". Defaults to "False".
If "True", updating\n -* of the "var", "ms", and "mom" tensors will be protected by a lock; otherwise -* the behavior is undefined, but may exhibit less contention. +* of the "var", "ms", and "mom" tensors will be protected by a lock; \n +* otherwise the behavior is undefined, but may exhibit less contention. * @li rho: A required scalar. Must have the same type as "var". * @li momentum: A required scalar. Must have the same type as "var". * @li epsilon: A required scalar. Must have the same type as "var". * * @par Outputs: * var: A mutable tensor. Must have the same type as input "var". +* * @attention Constraints: * @li Note that in dense implementation of this algorithm, "ms" and "mom" will\n * update even if "grad" is 0, but in this sparse implementation, "ms" and "mom"\n @@ -620,6 +913,8 @@ REG_OP(ApplyRMSPropD) .INPUT(lr, TensorType::NumberType()) .INPUT(grad, TensorType::NumberType()) .OUTPUT(var, TensorType::NumberType()) + .OUTPUT(ms, TensorType::NumberType()) + .OUTPUT(mom, TensorType::NumberType()) .REQUIRED_ATTR(rho, Float) .REQUIRED_ATTR(momentum, Float) .REQUIRED_ATTR(epsilon, Float) @@ -826,12 +1121,27 @@ REG_OP(ApplyAdam) .INPUT(epsilon, TensorType::NumberType()) .INPUT(grad, TensorType::NumberType()) .OUTPUT(var, TensorType::NumberType()) - .OUTPUT(m, TensorType::NumberType()) - .OUTPUT(v, TensorType::NumberType()) .ATTR(use_locking, Bool, false) .ATTR(use_nesterov, Bool, false) .OP_END_FACTORY_REG(ApplyAdam) +REG_OP(ApplyAdamD) + .INPUT(var, TensorType::NumberType()) + .INPUT(m, TensorType::NumberType()) + .INPUT(v, TensorType::NumberType()) + .INPUT(beta1_power, TensorType::NumberType()) + .INPUT(beta2_power, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(beta1, TensorType::NumberType()) + .INPUT(beta2, TensorType::NumberType()) + .INPUT(epsilon, TensorType::NumberType()) + .INPUT(grad, TensorType::NumberType()) + .OUTPUT(var, TensorType::NumberType()) + .OUTPUT(m, TensorType::NumberType()) + .OUTPUT(v, TensorType::NumberType()) + .ATTR(use_locking, Bool, false) + .ATTR(use_nesterov, Bool, false) + .OP_END_FACTORY_REG(ApplyAdamD) /** *@brief Updates "var" according to the proximal adadelta scheme. @@ -870,94 +1180,142 @@ REG_OP(ApplyAdadelta) .OP_END_FACTORY_REG(ApplyAdadelta) /** -*@brief Updates "var" according to the ApplyMomentum algorithm. \n -* accum = accum * momentum + x1 * x2 -* if use_nesterov is True: -* var -= x1 * x2 * lr + accum * momentum * lr -* else: -* var -= accum * lr +*@brief Updates "var" according to the proximal adadelta scheme. *@par Inputs: -*Six inputs, including: +*Seven inputs, including: * @li var: A mutable Tensor of type TensorType::NumberType(). * Should be a Variable Tensor. * @li accum: A mutable Tensor of the same type as "var". * Should be a Variable Tensor. +* @li accum_update: A mutable Tensor of the same type as "var". +* Should be a Variable Tensor. * @li lr: A scalar of the same type as "var", for the scaling factor. -* @li x1: A Tensor of type TensorType::NumberType(). -* @li momentum: A scalar of the same type as "var". -* @li x2: A Tensor of the same type as "var". +* @li rho: A scalar of the same type as "var", for the decay factor. +* @li epsilon: A scalar of the same type as "var", for the constant factor. +* @li grad: A Tensor of the same type as "var", for the gradient. *@par Attributes: -*Two Attributes, including: -*@li use_nesterov: An optional bool. Defaults to "False". 
\n -* If True, the tensor passed to compute grad will be var - lr * momentum * accum, \n -* so in the end, the var you get is actually var - lr * momentum * accum. -*@li use_locking: An optional bool. Defaults to "False". \n -* If "True", updating of the "var", m", and "v" tensors will be protected \n -* by a lock; otherwise the behavior is undefined, but may exhibit less contention. +*use_locking: An optional bool. Defaults to "False". +* If "True", updating of the "var", "accum" and "accum_update" tensors will be +* protected by a lock; otherwise the behavior is undefined, +* but may exhibit less contention. *@par Outputs: -*var: A mutable Tensor. Has the same type as "var". +*@li var: A mutable Tensor. Has the same type as "var". +*@li accum: A mutable Tensor. Has the same type as "var". +*@li accum_update: A mutable Tensor. Has the same type as "var". + */ -REG_OP(FusedMulApplyMomentum) - .INPUT(var, TensorType::NumberType()) - .INPUT(accum, TensorType::NumberType()) - .INPUT(lr, TensorType::NumberType()) - .INPUT(x1, TensorType::NumberType()) - .INPUT(momentum, TensorType::NumberType()) - .INPUT(x2, TensorType::NumberType()) - .OUTPUT(var, TensorType::NumberType()) - .ATTR(use_nesterov, Bool, false) - .ATTR(use_locking, Bool, false) - .OP_END_FACTORY_REG(FusedMulApplyMomentum) +REG_OP(ApplyAdadeltaD) + .INPUT(var, TensorType::NumberType()) + .INPUT(accum, TensorType::NumberType()) + .INPUT(accum_update, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(rho, TensorType::NumberType()) + .INPUT(epsilon, TensorType::NumberType()) + .INPUT(grad, TensorType::NumberType()) + .OUTPUT(var, TensorType::NumberType()) + .OUTPUT(accum, TensorType::NumberType()) + .OUTPUT(accum_update, TensorType::NumberType()) + .ATTR(use_locking, Bool, false) + .OP_END_FACTORY_REG(ApplyAdadeltaD) /** -*@brief Updates "var" according to the ApplyMomentum algorithm. \n -* accum = accum * momentum + x1 * x2 -* if use_nesterov is True: -* var -= x1 * x2 * lr + accum * momentum * lr -* else: -* var -= accum * lr +* @brief Updates "var" according to the ApplyMomentum algorithm. \n +* accum = accum * momentum + x1 * x2 \n +* if use_nesterov is True: \n +* var -= x1 * x2 * lr + accum * momentum * lr \n +* else:\n +* var -= accum * lr +* +* @par Inputs: +* Six inputs, including: +* @li var: A mutable Tensor has type TensorType::NumberType(). +* Should be a Variable Tensor. +* @li accum: A mutable Tensor has the same type as "var". +* Should be a Variable Tensor. +* @li lr: A scalar has the same type as "var", for the scaling factor. +* @li x1: A Tensor has type TensorType::NumberType(). +* @li momentum: A scalar has the same type as "var". +* @li x2: A scalar has the same type as "var". +* +* @par Attributes: +* Two attributes, including: +* @li use_nesterov: An optional bool. Defaults to "False". \n +* If True, the tensor passed to compute grad will be var - lr * momentum * accum, \n +* so in the end, the var you get is actually var - lr * momentum * accum. +* @li use_locking: An optional bool. Defaults to "False". \n +* If "True", updating of the "var", m", and "v" tensors will be protected \n +* by a lock; otherwise the behavior is undefined, but may exhibit less contention. +* +* @par Outputs: +* Two outputs, including: +* @li var: A mutable Tensor has the same type as "var". +* @li accum: A mutable Tensor has the same type as "var". 
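+*
+* A scalar sketch of the fused update described above (illustrative only):
+*@code
+* accum = accum * momentum + x1 * x2;
+* if (use_nesterov) var -= x1 * x2 * lr + accum * momentum * lr;
+* else              var -= accum * lr;
+*@endcode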
+*/ +REG_OP(FusedMulApplyMomentum) + .INPUT(var, TensorType::NumberType()) + .INPUT(accum, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(x1, TensorType::NumberType()) + .INPUT(momentum, TensorType::NumberType()) + .INPUT(x2, TensorType::NumberType()) + .OUTPUT(var, TensorType::NumberType()) + .OUTPUT(accum, TensorType::NumberType()) + .ATTR(use_nesterov, Bool, false) + .ATTR(use_locking, Bool, false) + .OP_END_FACTORY_REG(FusedMulApplyMomentum) -*@par Inputs: -*Six inputs, including: -* @li var: A mutable Tensor of type TensorType::NumberType(). +/** +* @brief Updates "var" according to the ApplyMomentum algorithm. \n +* accum = accum * momentum + x1 * x2 \n +* if use_nesterov is True: \n +* var -= x1 * x2 * lr + accum * momentum * lr \n +* else: \n +* var -= accum * lr +* +* @par Inputs: +* Seven inputs, including: +* @li var: A mutable Tensor of type float32. * Should be a Variable Tensor. -* @li accum: A mutable Tensor of the same type as "var". +* @li accum: A mutable Tensor has type TensorType::NumberType(). * Should be a Variable Tensor. -* @li lr: A scalar of the same type as "var", for the scaling factor. -* @li x1: A Tensor of type TensorType::NumberType(). -* @li momentum: A scalar of the same type as "var". -* @li x2: A Tensor of the same type as "var". - -*@par Attributes: -*Two Attributes, including: -*@li use_nesterov: An optional bool. Defaults to "False". \n -* If True, the tensor passed to compute grad will be var - lr * momentum * accum, \n -* so in the end, the var you get is actually var - lr * momentum * accum. -*@li use_locking: An optional bool. Defaults to "False". \n -* If "True", updating of the "var", m", and "v" tensors will be protected \n -* by a lock; otherwise the behavior is undefined, but may exhibit less contention. - -*@par Outputs: -*Two outputs, including: -*@li var: A Tensor. Has the same type as "var". -*@li var_copy: A Tensor. Has the same type as "var". +* @li lr: A scalar has the same type as "accum", for the scaling factor. +* @li x1: A Tensor has the same type as "accum". +* @li momentum: A scalar has the same type as "accum". +* @li x2: A scalar has the same type as "accum". +* @li var_copy: A Tensor has type float16. +* +* @par Attributes: +* Two Attributes, including: +* @li use_nesterov: An optional bool. Defaults to "False". \n +* If True, the tensor passed to compute grad will be var - lr * momentum * accum, \n +* so in the end, the var you get is actually var - lr * momentum * accum. +* @li use_locking: An optional bool. Defaults to "False". \n +* If "True", updating of the "var", m", and "v" tensors will be protected \n +* by a lock; otherwise the behavior is undefined, but may exhibit less contention. +* +* @par Outputs: +* Three outputs, including: +* @li var: A Tensor has the type float32. +* @li var_copy: A Tensor has the type float16. +* @li accum: A Tensor has the same type as input "accum". 
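+*
+* The extern variant keeps a float32 master weight ("var") alongside a float16
+* working copy ("var_copy"), a layout commonly used for mixed-precision training.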
*/ REG_OP(FusedMulApplyMomentumExtern) - .INPUT(var, TensorType::NumberType()) - .INPUT(accum, TensorType::NumberType()) - .INPUT(lr, TensorType::NumberType()) - .INPUT(x1, TensorType::NumberType()) - .INPUT(momentum, TensorType::NumberType()) - .INPUT(x2, TensorType::NumberType()) - .INPUT(var_copy, TensorType::NumberType()) - .OUTPUT(var, TensorType::NumberType()) - .OUTPUT(var_copy, TensorType::NumberType()) - .ATTR(use_nesterov, Bool, false) - .ATTR(use_locking, Bool, false) - .OP_END_FACTORY_REG(FusedMulApplyMomentumExtern) + .INPUT(var, TensorType(DT_FLOAT)) + .INPUT(accum, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(x1, TensorType::NumberType()) + .INPUT(momentum, TensorType::NumberType()) + .INPUT(x2, TensorType::NumberType()) + .INPUT(var_copy, TensorType(DT_FLOAT16)) + .OUTPUT(var, TensorType(DT_FLOAT)) + .OUTPUT(var_copy, TensorType(DT_FLOAT16)) + .OUTPUT(accum, TensorType::NumberType()) + .ATTR(use_nesterov, Bool, false) + .ATTR(use_locking, Bool, false) + .OP_END_FACTORY_REG(FusedMulApplyMomentumExtern) /** *@brief Update "g" according to the LARS algorithm. @@ -1051,6 +1409,7 @@ REG_OP(LarsV2Update) * @par Outputs: * var: A Tensor. Has the same type and format as input "var". + */ REG_OP(SparseApplyFtrl) .INPUT(var, TensorType({DT_FLOAT})) @@ -1092,6 +1451,9 @@ REG_OP(SparseApplyFtrl) * @par Outputs: * var: A Tensor. Has the same type and format as input "var". +* accum: A Tensor. Has the same type and format as input "accum". +* linear: A Tensor. Has the same type and format as input "linear". + */ REG_OP(SparseApplyFtrlD) .INPUT(var, TensorType({DT_FLOAT})) @@ -1100,6 +1462,8 @@ REG_OP(SparseApplyFtrlD) .INPUT(grad, TensorType({DT_FLOAT})) .INPUT(indices, TensorType({DT_INT32})) .OUTPUT(var, TensorType({DT_FLOAT})) + .OUTPUT(accum, TensorType({DT_FLOAT})) + .OUTPUT(linear, TensorType({DT_FLOAT})) .REQUIRED_ATTR(lr, Float) .REQUIRED_ATTR(l1, Float) .REQUIRED_ATTR(l2, Float) @@ -1135,6 +1499,7 @@ REG_OP(SparseApplyFtrlD) * @par Outputs: * var: A Tensor. Has the same type and format as input "var". + */ REG_OP(SparseApplyFtrlV2) .INPUT(var, TensorType({DT_FLOAT})) @@ -1179,6 +1544,9 @@ REG_OP(SparseApplyFtrlV2) * @par Outputs: * var: A Tensor. Has the same type and format as input "var". +* accum: A Tensor. Has the same type and format as input "accum". +* linear: A Tensor. Has the same type and format as input "linear". + */ REG_OP(SparseApplyFtrlV2D) .INPUT(var, TensorType({DT_FLOAT})) @@ -1187,6 +1555,8 @@ REG_OP(SparseApplyFtrlV2D) .INPUT(grad, TensorType({DT_FLOAT})) .INPUT(indices, TensorType({DT_INT32})) .OUTPUT(var, TensorType({DT_FLOAT})) + .OUTPUT(accum, TensorType({DT_FLOAT})) + .OUTPUT(linear, TensorType({DT_FLOAT})) .REQUIRED_ATTR(lr, Float) .REQUIRED_ATTR(l1, Float) .REQUIRED_ATTR(l2, Float) diff --git a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h index 42ab1a4c..992077ad 100644 --- a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h +++ b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h @@ -72,6 +72,17 @@ REG_OP(TanhGrad) .OUTPUT(z, TensorType::UnaryDataType()) .OP_END_FACTORY_REG(TanhGrad) +/** +*@brief: Computes hyperbolic tangent of "x" element-wise. + +*@par Inputs: +*One input: +*x: A Tensor. Must be one of the following types: float16, float32, double, complex64, complex128, int32, int64 + +*@par Outputs: +*y: A Tensor. Has the same type as "x". 
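+*
+* Reference formula: tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)).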
+
+*/
REG_OP(Tanh)
    .INPUT(x, TensorType::UnaryDataType())
    .OUTPUT(y, TensorType::UnaryDataType())
    .OP_END_FACTORY_REG(Tanh)
@@ -99,17 +110,17 @@ REG_OP(Relu)

/**
* @brief Computes rectified linear 6.
-* activations = min(max(features, 0), 6).
+* activations = min(max(x, 0), 6).

* @par Inputs:
-* features: A Tensor of type RealNumberType.
+* x: A Tensor of type RealNumberType.

* @par Outputs:
-* activations: A Tensor of type RealNumberType.
+* y: A Tensor of type RealNumberType.
*/
REG_OP(Relu6)
-    .INPUT(features, TensorType::RealNumberType())
-    .OUTPUT(activations, TensorType::RealNumberType())
+    .INPUT(x, TensorType::RealNumberType())
+    .OUTPUT(y, TensorType::RealNumberType())
    .OP_END_FACTORY_REG(Relu6)

/**
@@ -138,14 +149,11 @@ REG_OP(Relu6Grad)
* @par Outputs:
* A Tensor. Has the same type as "x".

-* @attention Constraints:
-* @li Ascend 310 provides only 1e-3 accuracy for the result.
-
* @see Relu()
*/
REG_OP(Sigmoid)
-    .INPUT(x, TensorType(UnaryDataType))
-    .OUTPUT(y, TensorType(UnaryDataType))
+    .INPUT(x, TensorType::UnaryDataType())
+    .OUTPUT(y, TensorType::UnaryDataType())
    .OP_END_FACTORY_REG(Sigmoid)

/**
@@ -168,8 +176,8 @@ REG_OP(Activation)
    .INPUT(x, TensorType::ALL())
    .OUTPUT(y, TensorType::ALL())
    /*
-       0:sigmod, 1:relu, 2:tanh, 3:clipped ReLU, 4:Elu,
-       5:leaky relu, 6:abs, 7:relu1, 8:softsign, 9:softplus
+       0: sigmoid, 1: relu, 2: tanh, 3: clipped ReLU, 4: Elu,
+       5: leaky relu, 6: abs, 7: relu1, 8: softsign, 9: softplus
    */
    .ATTR(mode, Int, 1)
    .ATTR(coef, Float, 0)
@@ -182,29 +190,110 @@ REG_OP(ActivationGrad)
    .ATTR(mode, Int, 1)
    .OP_END_FACTORY_REG(ActivationGrad)

+/**
+*@brief Computes the binomial normal log likelihood (BNLL) output:\n
+*if x>0, x+log(1+exp(-x)); otherwise log(1+exp(x)).
+
+*@par Inputs:
+*x: A Tensor of type float16 or float32.
+
+*@par Outputs:
+*y: A tensor. Has the same type and format as input "x".
+
+*/
+REG_OP(BNLL)
+    .INPUT(x, TensorType::FloatingDataType())
+    .OUTPUT(y, TensorType::FloatingDataType())
+    .OP_END_FACTORY_REG(BNLL)
+
+/**
+*@brief Computes softplus: log(exp(x) + 1).
+
+*@par Inputs:
+* One input:\n
+*x: A Tensor of type float16 or float32. Up to 8D.
+
+*@par Outputs:
+*y: The activations tensor. Has the same type and format as input "x".
+
+*/
REG_OP(Softplus)
-    .INPUT(features, TensorType::FloatingDataType())
-    .OUTPUT(activations, TensorType::FloatingDataType())
+    .INPUT(x, TensorType::FloatingDataType())
+    .OUTPUT(y, TensorType::FloatingDataType())
    .OP_END_FACTORY_REG(Softplus)

+/**
+*@brief Computes softplus gradients for a softplus operation.
+
+*@par Inputs:
+*Two inputs:
+* @li gradients: An NC1HWC0 or ND Tensor of type float16 or float32.
+* @li features: An NC1HWC0 or ND Tensor of type float16 or float32.
+
+
+*@par Outputs:
+*backprops: A Tensor. Has the same type and format as input "gradients".
+
+*/
REG_OP(SoftplusGrad)
    .INPUT(gradients, TensorType::FloatingDataType())
    .INPUT(features, TensorType::FloatingDataType())
    .OUTPUT(backprops, TensorType::FloatingDataType())
    .OP_END_FACTORY_REG(SoftplusGrad)

+/**
+*@brief Computes softsign: x/(abs(x) + 1).
+
+*@par Inputs:
+* One input:\n
+*x: A Tensor of type float16 or float32. Up to 8D.
+
+*@par Outputs:
+*y: The activations tensor. Has the same type and format as "x".
+
+*/
REG_OP(Softsign)
-    .INPUT(features, TensorType::FloatingDataType())
-    .OUTPUT(activations, TensorType::FloatingDataType())
+    .INPUT(x, TensorType::FloatingDataType())
+    .OUTPUT(y, TensorType::FloatingDataType())
    .OP_END_FACTORY_REG(Softsign)

+/**
+*@brief Computes scaled exponential linear: scale * alpha * (exp(x) - 1).
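+*
+* More precisely, y = scale * x for x > 0 and y = scale * alpha * (exp(x) - 1)
+* otherwise, with alpha ~= 1.67326 and scale ~= 1.05070 in the standard SELU
+* formulation.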
+
+*@par Inputs:
+* One input: \n
+*x: A Tensor. Must be one of the following types: float16, float32, int32, int8.
+
+*@par Outputs:
+*y: A Tensor. Has the same type and format as input "x".
+
+*@see Region()
+
+*/
REG_OP(Selu)
-    .INPUT(features, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,
+    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,
                                  DT_INT8,DT_INT32}))
-    .OUTPUT(activations, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,
+    .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,
                                      DT_INT8,DT_INT32}))
    .OP_END_FACTORY_REG(Selu)

+/**
+*@brief Computes rectified linear gradients for a ReLU operation.
+
+*@par Inputs:
+* Two inputs, including:
+*@li gradients: A Tensor. Must be one of the following types: float32, double, int32, int8, int16, int64, uint16, float16, uint32, uint64
+*@li features: A Tensor. Must be one of the following types: float32, double, int32, int8, int16, int64, uint16, float16, uint32, uint64
+
+*@par Outputs:
+*backprops: A Tensor. Must have the same type as "gradients".
+
+*@attention Constraints:
+* The corresponding Relu operator must appear before this operator in the network.
+
+*@see Relu
+
+*/
REG_OP(ReluGrad)
    .INPUT(gradients, TensorType::RealNumberType())
    .INPUT(features, TensorType::RealNumberType())
@@ -234,20 +323,19 @@ REG_OP(ReluGradV2)
    .OP_END_FACTORY_REG(ReluGradV2)

/**
-*@brief Computes rectified linear: `max(x, 0)`.
+*@brief Computes rectified linear: "max(x, 0)".
*
*@attention Constraints:\n
-* The last dim must be mutiply of 8
-* The second output `mask` is the result of `y` use 'gt' compare with 0.
+* The last dimension must be divisible by 8.
+* The second output "mask" is "1" (for y >= 0) or "0" (for y < 0).
+
*
*@par Inputs:
* x: A tensor. Must be one of the following types: float32, float64, int32, uint8,
*     int16, int8, int64, uint16, float16, qint8.
*
*@par Outputs:
-*@li y : A `Tensor`. Has the same type as `x`.
-*@li mask : A `Tensor`. Must be the type : `uint8`.
-*
+*@li y: A tensor. Has the same type as "x".
+*@li mask: A tensor of type uint8.
*/
REG_OP(ReluV2)
    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_INT8, DT_INT32, DT_INT16, DT_INT64, DT_UINT8, DT_UINT16, DT_QINT8}))
@@ -255,29 +343,65 @@ REG_OP(ReluV2)
    .OUTPUT(mask, TensorType({DT_UINT8}))
    .OP_END_FACTORY_REG(ReluV2)

+/**
+*@brief Performs parametric ReLU.
+
+*@par Inputs:
+* Two inputs, including: \n
+*@li x: A multi-dimensional Tensor of type float16 or float32.
+*@li weight: A Scalar or 1D Tensor of type float16 or float32, specifying the weight, the initial value of "a". The number of dimensions must be the same as the number of channels.
+
+*@par Outputs:
+*y: An activated Tensor. Has the same dimensions as "x".
+
+*/
REG_OP(PRelu)
    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
    .INPUT(weight, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OP_END_FACTORY_REG(PRelu)

+/**
+*@brief Performs the backpropagation of PRelu for training scenarios.
+
+*@par Inputs:
+* Three inputs, including: \n
+*@li grads: Input gradient. Multi-dimensional Tensors are supported. The data type can be float16 or float32.
+*@li features: A multi-dimensional Tensor of type float16 or float32.
+*@li weights: A Scalar or 1D Tensor of type float16 or float32, specifying the weight. The number of dimensions must be the same as the number of channels.
+
+*@par Outputs:
+*@li dx: Reverse gradient of "features". Has the same dimensions and type as "features".
+*@li da: Reverse gradient of "weight".
Has the same dimensions and type as "features". + +*/ REG_OP(PReluGrad) - .INPUT(input_gradients, TensorType({DT_FLOAT16, DT_FLOAT})) - .INPUT(input_features, TensorType({DT_FLOAT16, DT_FLOAT})) - .INPUT(input_weights, TensorType({DT_FLOAT16, DT_FLOAT})) - .OUTPUT(output_backprops_dx, TensorType({DT_FLOAT16, DT_FLOAT})) - .OUTPUT(output_backprops_da, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(grads, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(features, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(weights, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(dx, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(da, TensorType({DT_FLOAT16, DT_FLOAT})) .OP_END_FACTORY_REG(PReluGrad) /** -*@brief Computes exponential linear: `exp(x) - 1` if < 0, `x` otherwise. -* +*@brief Activation function fused from sigmoid and ReLU, with soft saturation on the left and no saturation on the right. + *@par Inputs: -* x : A `Tensor`. Must be one of the following types: `float16`, `float32`, `float64`. -* +*x: A float16 or float32, for the input data type. + +*@par Attributes: +*alpha: A float. Defines at which negative value the ELU saturates. Defaults to "1.0". + *@par Outputs: -* y : A `Tensor`. Has the same type as `x`. +*y: A float16 or float32, for the normalized result. + +*@attention Constraints: +*@li The input is of type float16 or float32. + +*@par Multiple batches supported or not +*Supported * +*@since V100R001C33 */ REG_OP(Elu) .INPUT(x, TensorType::FloatingDataType()) @@ -289,13 +413,13 @@ REG_OP(Elu) *@brief Computes gradients for the exponential linear (Elu) operation. * *@par Inputs: -*@li grads : A `Tensor`. Must be one of the following types: `float16`, `float32`, `float64`. +*@li grads: A tensor. Must be one of the following types: float16, float32, float64. * The backpropagated gradients to the corresponding Elu operation. -*@li activations : A `Tensor`. Must have the same type as `grads`. +*@li activations: A tensor. Has the same type as "grads". * The outputs of the corresponding Elu operation. * *@par Outputs: -* y : A `Tensor`. Has the same type as `grads`. +* y: A tensor. Has the same type as "grads". * */ REG_OP(EluGrad) @@ -304,6 +428,46 @@ REG_OP(EluGrad) .OUTPUT(y, TensorType::FloatingDataType()) .OP_END_FACTORY_REG(EluGrad) +/** +*@brief Computes the output as x if x > 0 and negative_slope * x if x <= 0. + +*@par Inputs: +* One input: +* x: A Tensor. Must be one of the following types: float32, float16, int32, int8, double. +* +*@par Attributes: +*negative_slope: A float32. Defaults to "0.0". +* +*@par Outputs: +*y: A Tensor. Has the same type as "x". +*/ +REG_OP(LeakyRelu) + .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8, DT_DOUBLE})) + .ATTR(negative_slope, Float, 0.0) + .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8, DT_DOUBLE})) + .OP_END_FACTORY_REG(LeakyRelu) + +/** +*@brief Computes the output as g if x > 0 and negative_slope * g if x <= 0. + +*@par Inputs: +* Two inputs, including: +* @li g: A Tensor. Must be one of the following types: float16, float32, double. +* @li x: A Tensor. Has the same type as "g". + +*@par Attributes: +*negative_slope: A float32. Defaults to "0.0". + +*@par Outputs: +*y: A Tensor. Has the same type as "g". 
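*@par Example:
* An illustrative sketch (not part of the original patch) pairing the forward and backward ops; "fwd_in" and "grad_in" are hypothetical upstream nodes, and the accessors are the ones REG_OP is assumed to generate:
*     ge::op::LeakyRelu lrelu("leaky_relu");
*     lrelu.set_input_x(fwd_in);
*     lrelu.set_attr_negative_slope(0.1f);
*     ge::op::LeakyReluGrad lrelu_grad("leaky_relu_grad");
*     lrelu_grad.set_input_g(grad_in);            // incoming gradient
*     lrelu_grad.set_input_x(fwd_in);             // the forward input "x"
*     lrelu_grad.set_attr_negative_slope(0.1f);   // should match the forward value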
+*/ +REG_OP(LeakyReluGrad) +.INPUT(g, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) +.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) +.ATTR(negative_slope, Float, 0.0) +.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) +.OP_END_FACTORY_REG(LeakyReluGrad) + } // namespace ge #endif // GE_OP_NONLINEAR_FUC_OPS_H diff --git a/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h b/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h index eedd1c4c..daeea466 100644 --- a/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h +++ b/third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h @@ -70,7 +70,6 @@ REG_OP(NPUGetFloatStatus) .INPUT(addr, TensorType{DT_FLOAT}) .OUTPUT(data, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(NPUGetFloatStatus) - } // namespace ge #endif // GE_OP_NN_LOSS_SCALE_OPS_H diff --git a/third_party/fwkacllib/inc/ops/outfeed_ops.h b/third_party/fwkacllib/inc/ops/outfeed_ops.h index d5f38b30..049d83d9 100644 --- a/third_party/fwkacllib/inc/ops/outfeed_ops.h +++ b/third_party/fwkacllib/inc/ops/outfeed_ops.h @@ -22,6 +22,22 @@ namespace ge { +/** +*@brief Enqueue a Tensor on the computation outfeed. + +*@par Inputs: +*Inputs include: \n +*x: A Tensor. Must be one of the following types: float16, float32, \n +float64, int8, int16, uint16, uint8, int32, int64, uint32, uint64, \n +bool, double, string. + +*@par Attributes: +*channel_name: name of operator channel, default "". + +*@attention Constraints:\n +*-The implementation for OutfeedEnqueueOp on Ascend uses AICPU, with bad performance.\n + +*/ REG_OP(OutfeedEnqueueOp) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_UINT32, diff --git a/third_party/fwkacllib/inc/ops/pad_ops.h b/third_party/fwkacllib/inc/ops/pad_ops.h index 5c0a1ce0..dc471909 100644 --- a/third_party/fwkacllib/inc/ops/pad_ops.h +++ b/third_party/fwkacllib/inc/ops/pad_ops.h @@ -149,7 +149,7 @@ REG_OP(Pad) REG_OP(PadD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_FLOAT})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_FLOAT})) - .ATTR(paddings, ListListInt, {}) + .REQUIRED_ATTR(paddings, ListListInt) .OP_END_FACTORY_REG(PadD) /** diff --git a/third_party/fwkacllib/inc/ops/parsing_ops.h b/third_party/fwkacllib/inc/ops/parsing_ops.h index f790a03c..a8d1a757 100644 --- a/third_party/fwkacllib/inc/ops/parsing_ops.h +++ b/third_party/fwkacllib/inc/ops/parsing_ops.h @@ -22,6 +22,23 @@ namespace ge { +/** +*@brief Converts each string in the input Tensor to the specified numeric type. + +*@par Inputs: +*Inputs include: \n +*x: A Tensor. Must be one of the following types: string. + +*@par Attributes: +*out_type: The numeric type to interpret each string in string_tensor as. + +*@par Outputs: +*y: A Tensor. Has the same type as x. + +*@attention Constraints:\n +*-The implementation for StringToNumber on Ascend uses AICPU, with bad performance.\n + +*/ REG_OP(StringToNumber) .INPUT(x, TensorType({DT_STRING})) .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64})) diff --git a/third_party/fwkacllib/inc/ops/power_ops.h b/third_party/fwkacllib/inc/ops/power_ops.h new file mode 100644 index 00000000..b1f5bc24 --- /dev/null +++ b/third_party/fwkacllib/inc/ops/power_ops.h @@ -0,0 +1,49 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #ifndef GE_OP_POWER_H + #define GE_OP_POWER_H + + #include "../graph/operator_reg.h" + + namespace ge { + +/** +*@brief Computes the output as (shift + scale * x) ^ power. + +*@par Inputs: +* x: A Tensor of type float16 or float32. + +*@par Attributes: +*@li power: Optional. Defaults to 1.0. +*@li scale: Optional. Defaults to 1.0. +*@li shift: Optional. Defaults to 0.0. + +*@par Outputs: +* y: A Tensor. Has the same type and shape as "x". +*/ + + REG_OP(Power) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(power, Float, 1.0) + .ATTR(scale, Float, 1.0) + .ATTR(shift, Float, 0.0) + .OP_END_FACTORY_REG(Power); + + } // namespace ge + + #endif // GE_OP_POWER_H diff --git a/third_party/fwkacllib/inc/ops/quantize_ops.h b/third_party/fwkacllib/inc/ops/quantize_ops.h index cac79015..235f2645 100644 --- a/third_party/fwkacllib/inc/ops/quantize_ops.h +++ b/third_party/fwkacllib/inc/ops/quantize_ops.h @@ -39,7 +39,7 @@ REG_OP(QuantizedInnerProduct) /** * @brief Dequantizes the input tensor into a float tensor.\n * [input_min_range, input_max_range] are scalar floats that specify the range -* for "output_data". +* for "output_data". \n * The "mode" attribute controls exactly which calculations are used to convert\n * the float values to their quantized equivalents. * @par Inputs: @@ -69,21 +69,53 @@ REG_OP(Dequantize) .ATTR(mode, String, "MIN_COMBINED") .OP_END_FACTORY_REG(Dequantize) +/** +*@brief Quantizes the input. + +*@par Inputs: +*x: An NC1HWC0 tensor of type float16 or float32, specifying the input. + +*@par Attributes: +*@li scale: A required float32, specifying the scaling ratio. +*@li offset: A required float16, specifying the offset. +*@li sqrt_mode: A optional bool, specifying whether to perform square root on "scale", either "True" or "False". Defaults to "False". +*@li round_mode: An optional string, specifying the float16 to int8 cast type. +* The value range is [Round, Floor, Ceiling, Truncate]. Defaults to "Round". + +*@par Outputs: +*y: The quantized output tensor of type int8 and with format NC1HWC0. +*/ REG_OP(AscendQuant) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32})) .OUTPUT(y, TensorType({DT_INT8})) .REQUIRED_ATTR(scale, Float) - .REQUIRED_ATTR(sqrt_mode, Bool) .REQUIRED_ATTR(offset, Float) + .ATTR(sqrt_mode, Bool, false) .ATTR(round_mode, String, "Round") .OP_END_FACTORY_REG(AscendQuant) +/** +*@brief Dequantizes the input. + +*@par Inputs: +*@li x: An NC1HWC0 tensor of type int32, specifying the input. +*@li deq_scale: An NC1HWC0 tensor of type float16 or uint64, specifying the scaling ratio. + +*@par Attributes: +*@li sqrt_mode: A optional bool, specifying whether to perform square root on "scale", either "True" or "False". Defaults to "False". +*@li relu_flag: A optional bool, specifying whether to perform ReLU, either "True" or "False". Defaults to "False". +*@li dtype: A optional int32, specifying the output data type. Defaults to "DT_FLOAT". + +*@par Outputs: +*y: The dequantized output tensor of type float16 or float32 and with format NC1HWC0. 
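*@par Example:
* An illustrative sketch (not part of the original patch) of the revised attribute set; "conv_out", "int32_out", and "deq_scale_node" are hypothetical producers:
*     ge::op::AscendQuant quant("quant");
*     quant.set_input_x(conv_out);
*     quant.set_attr_scale(0.05f);
*     quant.set_attr_offset(-128.0f);         // sqrt_mode and round_mode now fall back to defaults
*     ge::op::AscendDequant dequant("dequant");
*     dequant.set_input_x(int32_out);
*     dequant.set_input_deq_scale(deq_scale_node);
*     dequant.set_attr_dtype(DT_FLOAT);       // new optional attr; defaults to DT_FLOAT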
+*/
REG_OP(AscendDequant)
    .INPUT(x, TensorType({DT_INT32}))
    .INPUT(deq_scale, TensorType({DT_FLOAT16, DT_UINT64}))
-    .OUTPUT(y, TensorType({DT_FLOAT16}))
-    .REQUIRED_ATTR(sqrt_mode, Bool)
-    .REQUIRED_ATTR(relu_flag, Bool)
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .ATTR(sqrt_mode, Bool, false)
+    .ATTR(relu_flag, Bool, false)
+    .ATTR(dtype, Int, DT_FLOAT)
    .OP_END_FACTORY_REG(AscendDequant)
} // namespace ge

diff --git a/third_party/fwkacllib/inc/ops/ragged_array_ops.h b/third_party/fwkacllib/inc/ops/ragged_array_ops.h
new file mode 100644
index 00000000..245f3551
--- /dev/null
+++ b/third_party/fwkacllib/inc/ops/ragged_array_ops.h
@@ -0,0 +1,61 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_OP_RAGGED_ARRAY_OPS_H
+#define GE_OP_RAGGED_ARRAY_OPS_H
+
+#include "graph/operator.h"
+#include "graph/operator_reg.h"
+
+namespace ge {
+
+/**
+*@brief Gather ragged slices from `params` axis `0` according to `indices`.
+
+*@par Inputs:
+*@li params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the \n
*`params` RaggedTensor input.
+*@li params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change \n
*at the Python level from dense_values to flat_values, so dense_values is the \n
*deprecated name.
+*@li indices: Indices in the outermost dimension of `params` of the values that should be \n
*gathered.
+
+*@par Attributes:
+*@li Tsplits: A required attribute specifying the type of the `row_splits` tensors.
+*@li PARAMS_RAGGED_RANK: The ragged rank of the `params` input. Defaults to 1.
+*@li OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain \n
*this number of `row_splits` tensors. This value should equal \n
*`indices.shape.ndims + params.ragged_rank - 1`. Defaults to 0.
+
+*@par Outputs:
+*@li output_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the \n
*returned RaggedTensor.
+*@li output_dense_values: The `flat_values` for the returned RaggedTensor.
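*@par Example:
* An illustrative sketch (not part of the original patch); it assumes the create_dynamic_input_/set_dynamic_input_ accessors that DYNAMIC_INPUT is expected to generate, and hypothetical nodes "splits0", "values", and "idx":
*     ge::op::RaggedGather rg("ragged_gather");
*     rg.create_dynamic_input_params_nested_splits(1);       // ragged rank 1: one splits tensor
*     rg.set_dynamic_input_params_nested_splits(0, splits0);
*     rg.set_input_params_dense_values(values);
*     rg.set_input_indices(idx);
*     rg.set_attr_Tsplits(DT_INT64);                         // required attribute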
+ +*/ + +REG_OP(RaggedGather) + .DYNAMIC_INPUT(params_nested_splits, TensorType({DT_INT32, DT_INT64})) + .INPUT(params_dense_values, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \ + DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL})) + .INPUT(indices, TensorType({DT_INT32, DT_INT64})) + .DYNAMIC_OUTPUT(output_nested_splits, TensorType({DT_INT32, DT_INT64})) + .OUTPUT(output_dense_values, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \ + DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL})) + .REQUIRED_ATTR(Tsplits, Type) + .ATTR(PARAMS_RAGGED_RANK, Int, 1) + .ATTR(OUTPUT_RAGGED_RANK, Int, 0) + .OP_END_FACTORY_REG(RaggedGather) + +} // namespace ge + +#endif //GE_OP_RAGGED_ARRAY_OPS_H \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h new file mode 100644 index 00000000..8e07bdc5 --- /dev/null +++ b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h @@ -0,0 +1,54 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_OP_RAGGED_CONVERSION_OPS_H +#define GE_OP_RAGGED_CONVERSION_OPS_H +#include "graph/operator_reg.h" + +namespace ge { + +/** +*@brief Converts a RaggedTensor into a SparseTensor with the same values. + +*@par Inputs: +*Two inputs, including: \n +*@li rt_nested_splits: A list of at least 1 Tensor objects with the same type \n +in: int32, int64. The row_splits for the RaggedTensor. +*@li rt_dense_values: A Tensor. The flat_values for the RaggedTensor \n +Must be one of the following types: bool, int8, int16, uint16, int32, \n +int64, double, float, float16. + +*@par Attributes: +*@li RAGGED_RANK: the dynamic of input rt_nested_splits with type int. +*@li Tsplits: A required attribute, the type is int64. + +*@par Outputs: +*@li sparse_indices: A Tensor of type int64. +*@li sparse_values: A Tensor. Has the same type as rt_dense_values. +*@li sparse_dense_shape: A Tensor of type int64. 
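*@par Example:
* An illustrative sketch (not part of the original patch) for a rank-2 ragged input; "splits0", "splits1", and "values" are hypothetical nodes, and the dynamic-input accessors are assumed from the DYNAMIC_INPUT macro:
*     ge::op::RaggedTensorToSparse r2s("ragged_to_sparse");
*     r2s.create_dynamic_input_rt_nested_splits(2);     // one splits tensor per ragged dimension
*     r2s.set_dynamic_input_rt_nested_splits(0, splits0);
*     r2s.set_dynamic_input_rt_nested_splits(1, splits1);
*     r2s.set_input_rt_dense_values(values);
*     r2s.set_attr_RAGGED_RANK(2);                      // must match the number of splits inputs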
+
+*/
+REG_OP(RaggedTensorToSparse)
+    .DYNAMIC_INPUT(rt_nested_splits, TensorType({DT_INT32, DT_INT64}))
+    .INPUT(rt_dense_values, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(sparse_indices, TensorType({DT_INT64}))
+    .OUTPUT(sparse_values, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(sparse_dense_shape, TensorType({DT_INT64}))
+    .ATTR(RAGGED_RANK, Int, 1)
+    .ATTR(Tsplits, Type, DT_INT64)
+    .OP_END_FACTORY_REG(RaggedTensorToSparse)
+} // namespace ge
+#endif // GE_OP_RAGGED_CONVERSION_OPS_H
\ No newline at end of file
diff --git a/third_party/fwkacllib/inc/ops/ragged_math_ops.h b/third_party/fwkacllib/inc/ops/ragged_math_ops.h
new file mode 100644
index 00000000..51797ff8
--- /dev/null
+++ b/third_party/fwkacllib/inc/ops/ragged_math_ops.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_OP_RAGGED_MATH_OPS_H
+#define GE_OP_RAGGED_MATH_OPS_H
+
+#include "graph/operator.h"
+#include "graph/operator_reg.h"
+
+namespace ge {
+
+/**
+*@brief Returns a `RaggedTensor` containing the specified sequences of numbers.
+
+*@par Inputs:
+*@li starts: The starts of each range.
+*@li limits: The limits of each range.
+*@li deltas: The deltas of each range.
+
+*@par Outputs:
+*@li rt_nested_splits: The `row_splits` for the returned `RaggedTensor`.
+*@li rt_dense_values: The `flat_values` for the returned `RaggedTensor`.
+
+*@attention Constraints: \n
*The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors. \n
*The vector inputs must all have the same size. Scalar inputs are broadcast \n
*to match the size of the vector inputs.
+
+*/
+
+REG_OP(RaggedRange)
+    .INPUT(starts, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .INPUT(limits, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .INPUT(deltas, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .OUTPUT(rt_nested_splits, TensorType({DT_INT32, DT_INT64}))
+    .OUTPUT(rt_dense_values, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .REQUIRED_ATTR(Tsplits, Type)
+    .OP_END_FACTORY_REG(RaggedRange)
+
+} // namespace ge
+
+#endif //GE_OP_RAGGED_MATH_OPS_H
\ No newline at end of file
diff --git a/third_party/fwkacllib/inc/ops/random_ops.h b/third_party/fwkacllib/inc/ops/random_ops.h
index d9cdc1fc..41c1fff9 100644
--- a/third_party/fwkacllib/inc/ops/random_ops.h
+++ b/third_party/fwkacllib/inc/ops/random_ops.h
@@ -23,17 +23,61 @@
namespace ge {

+/**
+*@brief Draws samples from a multinomial distribution.
+
+*@par Inputs:
+*Inputs include: \n
+* @li logits: A Tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, \n
  int64, bfloat16, uint16, half, uint32, uint64. 2-D Tensor with shape [batch_size, num_classes].
+* @li num_samples: A Tensor of type int32. 0-D. Number of independent samples to draw for each row slice.
+
+*@par Attributes:
+*@li dtype: An optional type from: int32, int64. Defaults to int64.
+*@li seed: An optional int. Defaults to 0.
+*@li seed2: An optional int. Defaults to 0.
+
+*@par Outputs:
+*y: A Tensor of type dtype.
+
+*@attention Constraints:\n
+*-The implementation for Multinomial on Ascend uses AICPU, with bad performance.\n
+
+*/
REG_OP(Multinomial)
    .INPUT(logits, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
    .INPUT(num_samples, TensorType({DT_INT32}))
    .OUTPUT(y, TensorType({DT_INT32, DT_INT64}))
-    .ATTR(output_dtype, Type, DT_INT64)
+    .ATTR(dtype, Type, DT_INT64)
    .ATTR(seed, Int, 0)
    .ATTR(seed2, Int, 0)
    .OP_END_FACTORY_REG(Multinomial)

+/**
+*@brief Outputs random values from a normal distribution.
+
+*@par Inputs:
+*Inputs include: \n
+* @li shape: A Tensor. Must be one of the following types: int32, int64. \n
  The shape of the output tensor. Batches are indexed by the 0th dimension.
+* @li means: A Tensor. Must be one of the following types: half, bfloat16, float32, float64.
+* @li stdevs: A Tensor. Must have the same type as means.
+* @li min: A Tensor. Must have the same type as means. The minimum cutoff. May be -infinity.
+* @li max: A Tensor. Must have the same type as means. The maximum cutoff. May be +infinity.
+
+*@par Attributes:
+*@li seed: An optional int. Defaults to 0.
+*@li seed2: An optional int. Defaults to 0.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as means.
+
+*@attention Constraints:\n
+*-The implementation for ParameterizedTruncatedNormal on Ascend uses AICPU, with bad performance.\n
+
+*/
REG_OP(ParameterizedTruncatedNormal)
-    .INPUT(shape, TensorType({DT_INT32}))
+    .INPUT(shape, TensorType({DT_INT32, DT_INT64}))
    .INPUT(means, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
    .INPUT(stdevs, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
    .INPUT(min, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
@@ -43,12 +87,46 @@ REG_OP(ParameterizedTruncatedNormal)
    .ATTR(seed2, Int, 0)
    .OP_END_FACTORY_REG(ParameterizedTruncatedNormal)

+/**
+*@brief Computes the derivative of a Gamma random sample w.r.t. alpha.
+
+*@par Inputs:
+*Inputs include: \n
+* @li alpha: A Tensor. Must be one of the following types: float32, float64.
+* @li sample: A Tensor. Must have the same type as alpha.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as alpha.
+
+*@attention Constraints:\n
+*-The implementation for RandomGammaGrad on Ascend uses AICPU, with bad performance.\n
+
+*/
REG_OP(RandomGammaGrad)
    .INPUT(alpha, TensorType({DT_FLOAT, DT_DOUBLE}))
    .INPUT(sample, TensorType({DT_FLOAT, DT_DOUBLE}))
    .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
    .OP_END_FACTORY_REG(RandomGammaGrad)

+/**
+*@brief Outputs random values from the Gamma distribution(s) described by alpha.
+
+*@par Inputs:
+*Inputs include: \n
+* @li shape: A Tensor. Must be one of the following types: int32, int64. 1-D integer tensor.
+* @li alpha: A Tensor. Must be one of the following types: half, float32, float64.
+
+*@par Attributes:
+*@li seed: An optional int. Defaults to 0.
+*@li seed2: An optional int. Defaults to 0.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as alpha.
+
+*@attention Constraints:\n
+*-The implementation for RandomGamma on Ascend uses AICPU, with bad performance.\n
+
+*/
REG_OP(RandomGamma)
    .INPUT(shape, TensorType({DT_INT32, DT_INT64}))
    .INPUT(alpha, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
@@ -57,6 +135,26 @@ REG_OP(RandomGamma)
    .ATTR(seed2, Int, 0)
    .OP_END_FACTORY_REG(RandomGamma)

+/**
+*@brief Outputs random values from the Poisson distribution(s) described by rate.
+ +*@par Inputs: +*Inputs include: \n +* @li shape: A Tensor. Must be one of the following types: int32, int64. 1-D integer tensor. +* @li rate: A Tensor. Must be one of the following types: half, float32, float64, int32, int64. + +*@par Attributes: +*@li dtype: An optional type from: half, float32, float64, int32, int64. Defaults to int64. +*@li seed: An optional int. Defaults to 0. +*@li seed2: An optional int. Defaults to 0. + +*@par Outputs: +*y: A Tensor of type dtype. + +*@attention Constraints:\n +*-The implementation for RandomPoisson on Ascend uses AICPU, with bad performance.\n + +*/ REG_OP(RandomPoisson) .INPUT(shape, TensorType({DT_INT32, DT_INT64})) .INPUT(rate, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ @@ -68,17 +166,54 @@ REG_OP(RandomPoisson) .ATTR(seed2, Int, 0) .OP_END_FACTORY_REG(RandomPoisson) +/** +*@brief Randomly shuffles a tensor along its first dimension. + +*@par Inputs: +*Inputs include: \n +*x: A Tensor. The tensor to be shuffled. + +*@par Attributes: +*@li seed: An optional int. Defaults to 0. +*@li seed2: An optional int. Defaults to 0. + +*@par Outputs: +*y: A Tensor. Has the same type as x. + +*@attention Constraints:\n +*-The implementation for RandomShuffle on Ascend uses AICPU, with bad performance.\n + +*/ REG_OP(RandomShuffle) - .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_UINT32, \ - DT_UINT64, DT_BOOL, DT_DOUBLE})) - .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_UINT32, \ - DT_UINT64, DT_BOOL, DT_DOUBLE})) + .INPUT(x, TensorType({DT_INT64, DT_INT32, DT_UINT16, DT_INT16, + DT_UINT8, DT_INT8, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, + DT_COMPLEX128, DT_BOOL, DT_STRING, DT_RESOURCE})) + .OUTPUT(y, TensorType({DT_INT64, DT_INT32, DT_UINT16, DT_INT16, + DT_UINT8, DT_INT8, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, + DT_COMPLEX128, DT_BOOL, DT_STRING, DT_RESOURCE})) .ATTR(seed, Int, 0) .ATTR(seed2, Int, 0) .OP_END_FACTORY_REG(RandomShuffle) +/** +*@brief Outputs random values from a normal distribution. + +*@par Inputs: +*Inputs include: \n +*shape: A Tensor. Must be one of the following types: int32, int64. The shape of the output tensor. + +*@par Attributes: +*@li dtype: A type from: half, float16, float32, float64. The type of the output. +*@li seed: An optional int. Defaults to 0. +*@li seed2: An optional int. Defaults to 0. + +*@par Outputs: +*y: A Tensor of type dtype. + +*@attention Constraints:\n +*-The implementation for RandomStandardNormal on Ascend uses AICPU, with bad performance.\n + +*/ REG_OP(RandomStandardNormal) .INPUT(shape, TensorType({DT_INT32, DT_INT64})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) @@ -87,6 +222,26 @@ REG_OP(RandomStandardNormal) .ATTR(seed2, Int, 0) .OP_END_FACTORY_REG(RandomStandardNormal) +/** +*@brief Outputs random integers from a uniform distribution. + +*@par Inputs: +*Inputs include: \n +* @li shape: A Tensor. Must be one of the following types: int32, int64. The shape of the output tensor. +* @li min: A Tensor. Must be one of the following types: int32, int64. 0-D. +* @li max: A Tensor. Must have the same type as minval. 0-D. + +*@par Attributes: +*@li seed: An optional int. Defaults to 0. +*@li seed2: An optional int. Defaults to 0. + +*@par Outputs: +*y: A Tensor. Has the same type as min. 
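*@par Example:
* An illustrative sketch (not part of the original patch); "shape_node", "lo", and "hi" are hypothetical const producers, and the accessors are assumed from the REG_OP macro:
*     ge::op::RandomUniformInt rui("random_uniform_int");
*     rui.set_input_shape(shape_node);
*     rui.set_input_min(lo);
*     rui.set_input_max(hi);
*     rui.set_attr_seed(87);    // non-zero seeds make the draw reproducible
*     rui.set_attr_seed2(11);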
+
+*@attention Constraints:\n
+*-The implementation for RandomUniformInt on Ascend uses AICPU, with bad performance.\n
+
+*/
REG_OP(RandomUniformInt)
    .INPUT(shape, TensorType({DT_INT32, DT_INT64}))
    .INPUT(min, TensorType({DT_INT32, DT_INT64}))
@@ -96,6 +251,25 @@ REG_OP(RandomUniformInt)
    .ATTR(seed2, Int, 0)
    .OP_END_FACTORY_REG(RandomUniformInt)

+/**
+*@brief Outputs random values from a uniform distribution.
+
+*@par Inputs:
+*Inputs include: \n
+*shape: A Tensor. Must be one of the following types: int32, int64. The shape of the output tensor.
+
+*@par Attributes:
+*@li dtype: A type from: float16, float32, float64. The type of the output.
+*@li seed: An optional int. Defaults to 0.
+*@li seed2: An optional int. Defaults to 0.
+
+*@par Outputs:
+*y: A Tensor of type dtype.
+
+*@attention Constraints:\n
+*-The implementation for RandomUniform on Ascend uses AICPU, with bad performance.\n
+
+*/
REG_OP(RandomUniform)
    .INPUT(shape, TensorType({DT_INT32, DT_INT64}))
    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
@@ -104,6 +278,24 @@ REG_OP(RandomUniform)
    .ATTR(seed2, Int, 0)
    .OP_END_FACTORY_REG(RandomUniform)

+/**
+*@brief Outputs random values from a truncated normal distribution.
+
+*@par Inputs:
+*Inputs include: \n
+*shape: A Tensor. Must be one of the following types: int32, int64.
+
+*@par Attributes:
+*@li seed: An optional int. Defaults to 0.
+*@li seed2: An optional int. Defaults to 0.
+
+*@par Outputs:
+*y: A Tensor of type float16, float32, or double.
+
+*@attention Constraints:\n
+*-The implementation for TruncatedNormal on Ascend uses AICPU, with bad performance.\n
+
+*/
REG_OP(TruncatedNormal)
    .INPUT(shape, TensorType({ DT_INT32, DT_INT64 }))
    .OUTPUT(y, TensorType({ DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
@@ -111,6 +303,27 @@ REG_OP(TruncatedNormal)
    .ATTR(seed2, Int, 0)
    .OP_END_FACTORY_REG(TruncatedNormal)

+/**
+*@brief Generate random bit mask for dropout.
+
+*@par Inputs:
+include: \n
+*@li shape: The shape of the output tensor.
+*@li prob: A 0-D Tensor. The probability that each bit is set to 1.
+
+*@par Attributes:
+*@li seed: If either seed or seed2 is set to be non-zero, the random number\n
*generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
+*@li seed2: A second seed to avoid seed collision.
+
+*@par Outputs:
+*y: Output (1D) random numbers in uint data format.
+
+*@attention Constraints:\n
+*The output is aligned to 128 bits.
+
+*@see DropOutGenMask()
+*/
REG_OP(DropOutGenMask)
    .INPUT(shape, TensorType({ DT_INT32, DT_INT64 }))
    .INPUT(prob, TensorType({ DT_FLOAT16, DT_FLOAT }))
@@ -143,6 +356,23 @@ REG_OP(LinSpaceD)
    .OUTPUT(output, TensorType({DT_FLOAT}))
    .OP_END_FACTORY_REG(LinSpaceD)

+/**
+*@brief Generates values in an interval.
+
+*@par Inputs:\n
+* Four ND inputs, including:
+*@li input_assist: A 1D Tensor of type float32.
+*@li input_start: A 1D Tensor of type float32, for the first entry in the range.
+*@li input_stop: A 1D Tensor of type float32, for the last entry in the range.
+*@li input_num: A 1D Tensor of type int32, for the number of entries in the range.
+
+*@par Outputs:\n
+*output_op: A 1D Tensor of type float32.
+
+*@attention Constraints:\n
+* "input_assist" is a sequence of "input_num" evenly-spaced values beginning at 0 with a common difference of 1.
+
+*/
REG_OP(LinSpace)
    .INPUT(start, TensorType({DT_FLOAT, DT_DOUBLE}))
    .INPUT(stop, TensorType({DT_FLOAT, DT_DOUBLE}))
@@ -159,6 +389,25 @@ REG_OP(Dropout)
    .ATTR(beta, Float, 0.0)
    .OP_END_FACTORY_REG(Dropout)

+/**
+*@brief Shuffles the indices of non-zero elements.
+ +*@par Inputs: +include: \n +*x:A tensor <= 5-D. + +*@par Attributes: +*@li count:the count of output, if 0, out all no-zero elements. +*@li seed:If either seed or seed2 are set to be non-zero, the random number generator is seeded by the given seed. + Otherwise, it is seeded by a random seed. +*@li seed2:A second seed to avoid seed collision. + +*@par Outputs: +*@li y:2-D tensor, no-zero element index. +*@li mask:1-D, whether the corresponding index is valid. + +*@see RandomChoiceWithMask() +*/ REG_OP(RandomChoiceWithMask) .INPUT(x, TensorType({DT_BOOL})) .OUTPUT(y, TensorType({DT_INT32})) @@ -168,6 +417,32 @@ REG_OP(RandomChoiceWithMask) .ATTR(seed2, Int, 0) .OP_END_FACTORY_REG(RandomChoiceWithMask) +/** +*@brief Permutes data in the channel dimension of the input + +*@par Inputs: +*Inputs including: \n +* @li x: A required Tensor. Must be one of the following types: + float16, float32, int8, uint8, int16, uint16, int32, uint32, int64, uint64. + +*@par Attributes: +*@li group: A required int32, specifying the number of groups to split the channel dimension into. Defaults to "1". + +*@par Outputs: +*y: A required Tensor. Has same type and shape as "x". Must be one of the following types: + float16, float32, int8, uint8, int16, uint16, int32, uint32, int64, uint64. + +*@attention Constraints:\n +*@li "group" must be greater than 0 and must evenly divide the channel dimension size. +*@li The format of input "x" must be NCHW. +*/ +REG_OP(ShuffleChannel) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT8, DT_UINT8, DT_INT16, + DT_UINT16, DT_INT32, DT_UINT32,DT_INT64,DT_UINT64})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT8, DT_UINT8, DT_INT16, + DT_UINT16, DT_INT32, DT_UINT32,DT_INT64,DT_UINT64})) + .ATTR(group, Int, 1) + .OP_END_FACTORY_REG(ShuffleChannel) } // namespace ge #endif // GE_OP_RANDOM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/reduce_ops.h b/third_party/fwkacllib/inc/ops/reduce_ops.h index d7882df3..0ba3e17f 100644 --- a/third_party/fwkacllib/inc/ops/reduce_ops.h +++ b/third_party/fwkacllib/inc/ops/reduce_ops.h @@ -20,12 +20,49 @@ #include "../graph/operator_reg.h" namespace ge { +/** +*@brief Performs reduced batch normalization. + +*@par Inputs:\n +*x: A 5D Tensor of type float16 or float32, with format NC1HWC0. + +*@par Outputs: +*@li sum: A 1D Tensor of type float32 for SUM reduced "x". +*@li square_sum: A 1D Tensor of type float32 for SUMSQ reduced "x". + +*@attention Constraints:\n +* This operator is a BatchNorm fusion operator for updating the moving averages for training. \n This operator is used in conjunction with BNTrainingUpdate. +*/ REG_OP(BNTrainingReduce) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) .OUTPUT(sum, TensorType({DT_FLOAT})) .OUTPUT(square_sum, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(BNTrainingReduce) +/** +*@brief Performs the backpropagation of BatchNorm. + +*@par Inputs: +* Seven inputs, including: \n +*@li grads: A 5D Tensor of type float16 or float32, with format NC1HWC0, for the gradient. +*@li x: A 5D Tensor of type float16 or float32, with format NC1HWC0. +*@li diff_scale: A 5D Tensor of type float32, with format NC1HWC0, for the mean of "x". +*@li diff_offset: A 5D Tensor of type float32, with format NC1HWC0, for the variance of "x". +*@li scale: A 5D Tensor of type float32, with format NC1HWC0. +*@li batch_mean: A 5D Tensor of type float32, with format NC1HWC0, for the mean of "x". +*@li batch_variance: A 5D Tensor of type float32, with format NC1HWC0, for the variance of "x". 
+ +*@par Attributes: +*epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x". + +*@par Outputs: +*y: A Tensor of type float16 or float32, with format NC1HWC0, for the offset of "x". + +*@attention Constraints: +* The preceding layer of this operator must be BNTrainingUpdateGrad. + +*@see BNTrainingUpdateGrad +*/ REG_OP(BNTrainingReduceGrad) .INPUT(grads, TensorType({DT_FLOAT16,DT_FLOAT})) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) @@ -38,6 +75,35 @@ REG_OP(BNTrainingReduceGrad) .ATTR(epsilon, Float, 0.0001) .OP_END_FACTORY_REG(BNTrainingReduceGrad) +/** +*@brief Performs reduced batch normalization. + +*@par Inputs:\n +* Seven inputs, including: (NC1HWC0 supported) +*@li x: A 5D Tensor of type float16 or float32. +*@li sum: A 1D Tensor of type float32 for the output of operator BNTrainingReduce. +*@li square_sum: A 1D Tensor of type float32 for the output of operator BNTrainingReduce. +*@li scale: A 1D Tensor of type float32, for the scaling factor. +*@li offset: A 1D Tensor of type float32, for the scaling offset. +*@li mean: A 1D Tensor of type float32, for the updated mean. +*@li variance: A 1D Tensor of type float32, for the updated variance. + +*@par Attributes: +*@li epsilon: A required float32, specifying the small value added to variance to avoid dividing by zero. +*@li factor: A required float32, specifying the weight for updating the mean and variance. + +*@par Outputs:\n +* Five outputs, including: (NC1HWC0 supported) +*@li y: A 5D Tensor of type float16 or float32, for normalized "x". +*@li mean: A 5D Tensor of type float32, for the updated mean. +*@li variance: A 5D Tensor of type float32, for the updated variance. +*@li batch_mean: A 1D Tensor of type float32, for the mean of "x". +*@li batch_variance: A 1D Tensor of type float32, for the variance of "x". + +*@attention Constraints: +*@li This operator is a BatchNorm fusion operator for updating the moving averages for training. \n This operator is used in conjunction with BNTrainingReduce. +*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction. +*/ REG_OP(BNTrainingUpdate) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) .INPUT(sum, TensorType({DT_FLOAT})) @@ -55,6 +121,26 @@ REG_OP(BNTrainingUpdate) .OUTPUT(batch_variance, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(BNTrainingUpdate) +/** +*@brief Performs batch normalization for inference. + +*@par Inputs:\n +* Five inputs, including: (NC1HWC0 supported) +*@li x: A 5D Tensor of type float16 or float32. +*@li scale: A 5D Tensor of type float32, for the scaling factor. +*@li offset: A 5D Tensor of type float32, for the scaling offset. +*@li mean: A 5D Tensor of type float32, for the mean. +*@li variance: A 5D Tensor of type float32, for the variance. + +*@par Attributes: +*epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.0001". + +*@par Outputs:\n +*y: A 5D Tensor of type float16 or float32 for the normalized "x". + +*@attention Constraints: +*For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction. +*/ REG_OP(BNInfer) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) .INPUT(scale, TensorType({DT_FLOAT})) @@ -65,6 +151,31 @@ REG_OP(BNInfer) .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT})) .OP_END_FACTORY_REG(BNInfer) +/** +*@brief Performs reduced batch normalization. For some scene which don't contain +assignmoving average. 
+ +*@par Inputs:\n +* Five inputs, including: (NC1HWC0 supported) +*@li x: A 5D Tensor of type float16 or float32. +*@li sum: A 5D Tensor of type float32 for the output of operator BNTrainingReduce. +*@li square_sum: A 5D Tensor of type float32 for the output of operator BNTrainingReduce. +*@li scale: A 5D Tensor of type float32, for the scaling factor. +*@li offset: A 5D Tensor of type float32, for the scaling offset. + +*@par Attributes: +*epsilon: A required float32, specifying the small value added to variance to avoid dividing by zero. + +*@par Outputs:\n +* Three outputs, including: (NC1HWC0 supported) +*@li y: A 5D Tensor of type float16 or float32, for normalized "x". +*@li batch_mean: A 5D Tensor of type float32, for the mean of "x". +*@li batch_variance: A 5D Tensor of type float32, for the variance of "x". + +*@attention Constraints: +*@li This operator is used in conjunction with BNTrainingReduce. +*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction. +*/ REG_OP(BNTrainingUpdateV2) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) .INPUT(sum, TensorType({DT_FLOAT})) @@ -77,6 +188,38 @@ REG_OP(BNTrainingUpdateV2) .OUTPUT(batch_variance, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(BNTrainingUpdateV2) +REG_OP(BNTrainingUpdateV3) + .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(sum, TensorType({DT_FLOAT})) + .INPUT(square_sum, TensorType({DT_FLOAT})) + .INPUT(scale, TensorType({DT_FLOAT})) + .INPUT(offset, TensorType({DT_FLOAT})) + .REQUIRED_ATTR(epsilon, Float) + .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(batch_mean, TensorType({DT_FLOAT})) + .OUTPUT(batch_variance, TensorType({DT_FLOAT})) + .OUTPUT(reserve_1, TensorType({DT_FLOAT})) + .OUTPUT(reserve_2, TensorType({DT_FLOAT})) + .OP_END_FACTORY_REG(BNTrainingUpdateV3) + +/** +*@brief Performs the backpropagation of BatchNorm. + +*@par Inputs: +* Four inputs, including: \n +*@li grads: A 5D Tensor of type float16 or float32, with format NC1HWC0, for the gradient. +*@li x: A 5D Tensor of type float16 or float32, with format NC1HWC0. +*@li batch_mean: A 5D Tensor of type float32, with format NC1HWC0, for the mean of "x". +*@li batch_variance: A 5D Tensor of type float32, with format NC1HWC0, for the variance of "x". + +*@par Attributes: +*epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x". + +*@par Outputs: +*@li diff_scale: A Tensor of type float32, with format NC1HWC0, for the offset of "scale". +*@li diff_offset: A Tensor of type float32, with format NC1HWC0, for the offset of "offset". + +*/ REG_OP(BNTrainingUpdateGrad) .INPUT(grads, TensorType({DT_FLOAT16,DT_FLOAT})) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) @@ -87,6 +230,24 @@ REG_OP(BNTrainingUpdateGrad) .OUTPUT(diff_offset, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(BNTrainingUpdateGrad) +/** +*@brief Performs the backpropagation of BatchNorm for inference. + +*@par Inputs: +* Three inputs, including: \n +*@li grads: A 5D Tensor of type loat16 or float32, with format NC1HWC0, for the gradient. +*@li scale: A 5D Tensor of type float32, with format NC1HWC0. +*@li batch_variance: A 5D Tensor of type float32, with format NC1HWC0. It is an output of BatchNorm. + +*@par Attributes: +*epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x". + +*@par Outputs: +*x_backprop: A Tensor of type float16 or float32, with format NC1HWC0, for the offset of "x". 
+ +*@attention Constraints: +* The preceding layer of this operator must be operator BatchNorm. +*/ REG_OP(BNInferGrad) .INPUT(grads, TensorType({DT_FLOAT16,DT_FLOAT})) .INPUT(scale, TensorType({DT_FLOAT})) @@ -95,17 +256,47 @@ REG_OP(BNInferGrad) .ATTR(epsilon, Float, 0.0001) .OP_END_FACTORY_REG(BNInferGrad) +/** +*@brief Computes the sum of elements across dimensions of a tensor. + +*@par Inputs: +* Two inputs, including: \n +*@li x: A Tensor of type float16 or float32. Up to 8D. +*@li axes: A 1D list or tuple of int32 or int64. Specifies the dimensions to reduce. + +*@par Attributes: +*keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false". + +*@par Outputs: +*y: The reduced tensor. Has the same type and format as input "x". + +*/ REG_OP(ReduceSum) .INPUT(x, TensorType::NumberType()) - .INPUT(axis, TensorType::IndexNumberType()) + .INPUT(axes, TensorType::IndexNumberType()) .OUTPUT(y, TensorType::NumberType()) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(ReduceSum) +/** +*@brief Computes the sum of elements across dimensions of a tensor. + +*@par Inputs: +* One input: \n +*x: A Tensor. Up to 8D. Must be one of the following types: float16, float32, int32, int8, uint8. + +*@par Attributes: +*@li axes: A required 1D list or tuple of int32 or int64. Specifies the dimensions to reduce. +*@li keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false". + +*@par Outputs: +*y: The reduced tensor. Has the same type and format as input "x". + +*/ REG_OP(ReduceSumD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_INT32})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_INT32})) - .REQUIRED_ATTR(axis, ListInt) + .REQUIRED_ATTR(axes, ListInt) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(ReduceSumD) @@ -128,7 +319,7 @@ REG_OP(ReduceSumD) REG_OP(ReduceAllD) .INPUT(x, TensorType({DT_BOOL})) .OUTPUT(y, TensorType({DT_BOOL})) - .REQUIRED_ATTR(axis, ListInt) + .REQUIRED_ATTR(axes, ListInt) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(ReduceAllD) @@ -148,7 +339,7 @@ REG_OP(ReduceAllD) */ REG_OP(ReduceAll) .INPUT(x, TensorType({DT_BOOL})) - .INPUT(axis, TensorType::IndexNumberType()) + .INPUT(axes, TensorType::IndexNumberType()) .OUTPUT(y, TensorType({DT_BOOL})) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(ReduceAll) @@ -166,10 +357,11 @@ REG_OP(ReduceAll) *@par Outputs: *y: A Tensor. Has the same type and format as input "x". + */ REG_OP(ReduceProd) .INPUT(x,TensorType::NumberType()) - .INPUT(axis, TensorType::IndexNumberType()) + .INPUT(axes, TensorType::IndexNumberType()) .OUTPUT(y,TensorType::NumberType()) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(ReduceProd) @@ -182,7 +374,7 @@ REG_OP(ReduceProd) *x: A Tensor. Must be one of the following types: float16, float, int8, uint8. *@par Attributes: -*@li axis: A required int8, int16, int32, or int64. Specifies the dimensions to reduce. No default value. +*@li axes: A required int8, int16, int32, or int64. Specifies the dimensions to reduce. No default value. *@li keep_dims: An optional bool. If "True", retains reduced dimensions with length 1. Defaults to "False". *@par Outputs: @@ -190,11 +382,12 @@ REG_OP(ReduceProd) *@attention Constraints: * "keep_dims" is in the range [-rank(input_tensor), rank(input_tensor)]. 
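*@par Example:
* An illustrative sketch (not part of the original patch) of the renamed "axes" attribute on the fused reduction variants (shown with ReduceSumD; ReduceProdD is analogous); "feat" is a hypothetical input node:
*     ge::op::ReduceSumD rsum("reduce_sum_d");
*     rsum.set_input_x(feat);
*     rsum.set_attr_axes({0, 2});      // previously named "axis"
*     rsum.set_attr_keep_dims(false);  // reduced axes are removed from the output shape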
+
*/
REG_OP(ReduceProdD)
    .INPUT(x,TensorType({DT_FLOAT, DT_UINT8, DT_INT8, DT_INT32, DT_FLOAT16}))
    .OUTPUT(y,TensorType({DT_FLOAT, DT_UINT8, DT_INT8, DT_INT32, DT_FLOAT16}))
-    .REQUIRED_ATTR(axis, ListInt)
+    .REQUIRED_ATTR(axes, ListInt)
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ReduceProdD)
@@ -204,7 +397,7 @@ REG_OP(ReduceProdD)
*@par Inputs:
*Two inputs, including:
* @li x: A Tensor. Must be one of the following types: float16, float32, int8, uint8.
-* @li axis: The dimensions to reduce. Must be one of the following types: int, list, tuple, NoneType.\n
+* @li axes: The dimensions to reduce. Must be one of the following types: int, list, tuple, NoneType.\n
*   - If None (the default), reduces all dimensions.\n
*   - Must be in the range [-rank(x), rank(x)).
@@ -217,7 +410,7 @@ REG_OP(ReduceProdD)
*/
REG_OP(ReduceMean)
    .INPUT(x, TensorType::NumberType())
-    .INPUT(axis, TensorType::IndexNumberType())
+    .INPUT(axes, TensorType::IndexNumberType())
    .OUTPUT(y, TensorType::NumberType())
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ReduceMean)
@@ -230,7 +423,7 @@ REG_OP(ReduceMean)
* @li x: A Tensor. Must be one of the following types: float16, float32, int8, uint8.
*@par Attributes:
-*@li axis: The dimensions to reduce. Must be one of the following types: int, list, tuple, NoneType. \n
+*@li axes: The dimensions to reduce. Must be one of the following types: int, list, tuple, NoneType. \n
*  If None (the default), reduces all dimensions. \n
*  Must be in the range [-rank(x), rank(x)). \n
*@li keep_dims: A bool or NoneType. \n
@@ -242,13 +435,31 @@ REG_OP(ReduceMean)
REG_OP(ReduceMeanD)
    .INPUT(x, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT, DT_INT8, DT_UINT8}))
    .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32, DT_FLOAT, DT_INT8, DT_UINT8}))
-    .REQUIRED_ATTR(axis, ListInt)
+    .REQUIRED_ATTR(axes, ListInt)
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ReduceMeanD)

+/**
+*@brief Returns the maximum of elements across dimensions of a Tensor.
+
+*@par Inputs:
+* Two inputs, including: \n
+*@li x: A multi-dimensional Tensor of type float16, float32, or int16.
+*@li axes: A Scalar of type int32, specifying the axes information of the index with the maximum value.
+
+*@par Attributes:
+*keep_dims: A bool, specifying whether to keep dimensions for the output Tensor. Defaults to "false".
+
+*@par Outputs:
+*y: A multi-dimensional Tensor, specifying the maximum value of the corresponding axis in the tensor. Has the same type as "x". (If "keep_dims" is set to "false", each reduced dimension is removed from the output, so the output has one fewer dimension than "x" per entry in "axes". Otherwise, the reduced dimensions are retained with length 1.)
+
+*@attention Constraints:
+* The value range of "axes" is [-dims, dims - 1]. "dims" indicates the dimension length of "x".
+
+*/
REG_OP(ReduceMax)
    .INPUT(x, TensorType::NumberType())
-    .INPUT(axis, TensorType::IndexNumberType())
+    .INPUT(axes, TensorType::IndexNumberType())
    .OUTPUT(y, TensorType::NumberType())
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ReduceMax)
@@ -261,7 +472,7 @@ REG_OP(ReduceMax)
*@par Attributes:
* Two attributes, including: \n
-*@li axis: A required listint, specifying the axis information of the index with the maximum value.
+*@li axes: A required listint, specifying the axes information of the index with the maximum value.
*@li keep_dims: A bool, specifying whether to keep dimensions for the output Tensor. Defaults to "false".
*@par Outputs: @@ -275,31 +486,65 @@ REG_OP(ReduceMaxD) DT_FLOAT16, DT_INT32})) .OUTPUT(y, TensorType({DT_FLOAT, DT_UINT8, DT_INT8, DT_FLOAT16, DT_INT32})) - .REQUIRED_ATTR(axis, ListInt) + .REQUIRED_ATTR(axes, ListInt) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(ReduceMaxD) +/** +*@brief Computes the minimum of elements across dimensions of a tensor. + +*@par Inputs: +*@li input_tensor: A Tensor. Must be one of the following types: float16, float32, int8, uint8. +*@li axes: A Tensor of type int8 or int32. Specifies the dimensions to reduce. Defaults to "None". + +*@par Attributes:\n +*keep_dims: An optional bool. If "True", reduced dimensions will be retained. Defaults to "False". + +*@par Outputs:\n +*output_tensor: A Tensor. Must be one of the following types: float16, float32, int8, uint8. + +*@attention Constraints:\n +* If "axes = None", all dimensions will be reduced. "axes" must be in the range [-rank(input_shape), rank(input_shape)). + +*/ REG_OP(ReduceMin) .INPUT(x, TensorType::NumberType()) - .INPUT(axis, TensorType::IndexNumberType()) + .INPUT(axes, TensorType::IndexNumberType()) .OUTPUT(y, TensorType::NumberType()) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(ReduceMin) +/** +*@brief Computes the minimum of elements across dimensions of a tensor. + +*@par Inputs:\n +*input_min: A Tensor. Must be one of the following types: float16, float32, int8, uint8. + +*@par Attributes: +*@li axes: An optional int32, list, tuple, or NoneType value. Specifies the dimensions to reduce. Defaults to "None". +*@li keep_dims: An optional bool or NoneType value. If "True", reduced dimensions will be retained. Defaults to "None" (equivalent to "False"). + +*@par Outputs:\n +*output_min: A Tensor. Must be one of the following types: float16, float32, int8, uint8. + +*@attention Constraints:\n +* If "axes = None", all dimensions will be reduced. "axes" must be in the range [-rank(input_shape), rank(input_shape)). + +*/ REG_OP(ReduceMinD) .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8})) .OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8})) - .REQUIRED_ATTR(axis, ListInt) + .REQUIRED_ATTR(axes, ListInt) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(ReduceMinD) /** *@brief Computes the "logical or" of elements across dimensions of a tensor.\n -* Reduces `x` along the dimensions given in `axis`. -* Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each -* entry in `axis`. If `keep_dims` is true, the reduced dimensions +* Reduces "x" along the dimensions given in "axes". +* Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each +* entry in "axes". If "keep_dims" is true, the reduced dimensions * are retained with length 1. * -* If `axis` is None, all dimensions are reduced, and a +* If "axes" is None, all dimensions are reduced, and a * tensor with a single element is returned. * *@attention Constraints:\n @@ -307,54 +552,84 @@ REG_OP(ReduceMinD) * *@par Inputs: *@li x : The boolean tensor to reduce. -*@li axis : The dimensions to reduce. If `None` (the default), reduces all -* dimensions. Must be in the range `[-rank(x), rank(x))`. +*@li axes: The dimensions to reduce. If "None" (default), reduces all +* dimensions. Must be in the range "[-rank(x), rank(x))". * *@par Attributes: -* keep_dims : If true, retains reduced dimensions with length 1. +* keep_dims: If true, retains reduced dimensions with length 1. 
*
*@par Outputs:
-* y : The reduced tensor
+* y: The reduced tensor
*
*/
REG_OP(ReduceAny)
    .INPUT(x, TensorType({DT_BOOL}))
-    .INPUT(axis, TensorType::IndexNumberType())
+    .INPUT(axes, TensorType::IndexNumberType())
    .OUTPUT(y, TensorType({DT_BOOL}))
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ReduceAny)

/**
*@brief Computes the "logical or" of elements across dimensions of a tensor.\n
-* Reduces `x` along the dimensions given in `axis`.
-* Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-* entry in `axis`. If `keep_dims` is true, the reduced dimensions
+* Reduces "x" along the dimensions given in "axes".
+* Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each
+* entry in "axes". If "keep_dims" is true, the reduced dimensions
* are retained with length 1.
*
-* If `axis` is None, all dimensions are reduced, and a
+* If "axes" is None, all dimensions are reduced, and a
* tensor with a single element is returned.
*
*@attention Constraints:\n
* Only support bool
*
*@par Inputs:
-* x : The boolean tensor to reduce.
+* x: The boolean tensor to reduce.
*
*@par Attributes:
-*@li axis : The dimensions to reduce. If `None` (the default), reduces all
-* dimensions. Must be in the range `[-rank(x), rank(x))`.
-*@li keep_dims : If true, retains reduced dimensions with length 1.
+*@li axes: The dimensions to reduce. If "None" (default), reduces all
+* dimensions. Must be in the range "[-rank(x), rank(x))".
+*@li keep_dims: If true, retains reduced dimensions with length 1.
*
*@par Outputs:
-* y : The reduced tensor
+* y: The reduced tensor
*
*/
REG_OP(ReduceAnyD)
    .INPUT(x, TensorType({DT_BOOL}))
    .OUTPUT(y, TensorType({DT_BOOL}))
-    .REQUIRED_ATTR(axis, ListInt)
+    .REQUIRED_ATTR(axes, ListInt)
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ReduceAnyD)

+/**
+*@brief Computes a reduction on dimensions specified by "axis".
+*Four reduction operations are provided:
+*SUM Computes the sum of elements across specified dimensions of a tensor.
+*ASUM Computes the sum of absolute values of elements across specified dimensions of a tensor.
+*SUMSQ Computes the sum of squares of elements across specified dimensions of a tensor.
+*MEAN Computes the mean values of elements across specified dimensions of a tensor.
+
+*@par Inputs:
+*x: A Tensor of type float16 or float32.
+
+*@par Attributes:
+*@li operation: An optional int32 from 1(SUM), 2(ASUM), 3(SUMSQ), and 4(MEAN),
+*specifying the reduction algorithm. Defaults to 1.
+*@li axis: An optional int32, specifying the first axis to reduce. Defaults to "0".
+*The value range is [-N, N-1], where N is the input tensor rank.
+*@li coeff: An optional float32, specifying the scale coefficient. Defaults to "1.0".
+
+*@par Outputs:
+*y: A Tensor. Has the same type as "x".
+
+*@attention Constraints: The Reduction operator supports type float16 only on the device chip.
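*@par Example:
* An illustrative sketch (not part of the original patch); "feat" is a hypothetical float16 input node, and the accessors are assumed from the REG_OP macro:
*     ge::op::Reduction red("reduction");
*     red.set_input_x(feat);
*     red.set_attr_operation(2);   // 2 = ASUM, per the list above
*     red.set_attr_axis(1);        // first axis to reduce
*     red.set_attr_coeff(1.0f);    // scale coefficient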
+*/ +REG_OP(Reduction) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(operation, Int, 1) + .ATTR(axis, Int, 0) + .ATTR(coeff, Float, 1.0) + .OP_END_FACTORY_REG(Reduction); } //namespace ge diff --git a/third_party/fwkacllib/inc/ops/resource_variable_ops.h b/third_party/fwkacllib/inc/ops/resource_variable_ops.h new file mode 100644 index 00000000..04aadf40 --- /dev/null +++ b/third_party/fwkacllib/inc/ops/resource_variable_ops.h @@ -0,0 +1,56 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_OP_RESOURCE_VARIABLE_OPS_H +#define GE_OP_RESOURCE_VARIABLE_OPS_H + +#include "graph/operator.h" +#include "graph/operator_reg.h" + +namespace ge { + +REG_OP(VarHandleOp) + .ATTR(container, String, "") + .ATTR(shared_name, String, "") + .REQUIRED_ATTR(dtype, Type) + .ATTR(shape, ListInt, ge::UNKNOWN_SHAPE) + .OUTPUT(y, TensorType({DT_RESOURCE})) + .OP_END_FACTORY_REG(VarHandleOp) + +REG_OP(AssignVariableOp) + .INPUT(resource, TensorType({DT_RESOURCE})) + .INPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE})) + .REQUIRED_ATTR(dtype, Type) + .OP_END_FACTORY_REG(AssignVariableOp) + +REG_OP(AssignAddVariableOp) + .INPUT(resource, TensorType({DT_RESOURCE})) + .INPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE})) + .REQUIRED_ATTR(dtype, Type) + .OP_END_FACTORY_REG(AssignAddVariableOp) + +REG_OP(AssignSubVariableOp) + .INPUT(resource, TensorType({DT_RESOURCE})) + .INPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE})) + .REQUIRED_ATTR(dtype, Type) + .OP_END_FACTORY_REG(AssignSubVariableOp) + +} // namespace ge + +#endif //GE_OP_RESOURCE_VARIABLE_OPS_H \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/rnn.h b/third_party/fwkacllib/inc/ops/rnn.h index 8b0157fb..abd98695 100644 --- a/third_party/fwkacllib/inc/ops/rnn.h +++ b/third_party/fwkacllib/inc/ops/rnn.h @@ -149,6 +149,86 @@ REG_OP(BasicLSTMCellCStateGrad) .ATTR(forget_bias, Float, 1.0) .ATTR(activation, String, "tanh") .OP_END_FACTORY_REG(BasicLSTMCellCStateGrad) + +/** +*@brief: RNN operator. +*@par Inputs: +*eight inputs: \n +*@li x:A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_NZ. +*@li cont:A 1D Tensor. Must be one of the following types: float16. The format must be ND. +*@li x_static:A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_NZ. +*@li h_0:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li w_xh:A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_Z. +*@li w_sh:A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_Z. +*@li w_hh:A 4D Tensor. 
Must be one of the following types: float16. The format must be FRACTAL_Z. +*@li w_ho: A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_Z. +*@li bias_h: A 1D Tensor. Must be one of the following types: float16, float32. The format must be ND. +*@li bias_o: A 1D Tensor. Must be one of the following types: float16, float32. The format must be ND. + +*@par Attributes: +*@li expose_hidden: A bool identifying whether to expose the hidden state of the last time step. Defaults to false. +*@li num_output: An integer identifying the number of output features. Defaults to 0. + +*@par Outputs: +*two outputs: \n +*@li o: A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li h_t: A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*/ +REG_OP(RNN) + .INPUT(x, TensorType({DT_FLOAT16})) + .INPUT(cont, TensorType({DT_FLOAT16})) + .OPTIONAL_INPUT(x_static, TensorType({DT_FLOAT16})) + .OPTIONAL_INPUT(h_0, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(w_xh, TensorType({DT_FLOAT16})) + .INPUT(w_sh, TensorType({DT_FLOAT16})) + .INPUT(w_hh, TensorType({DT_FLOAT16})) + .INPUT(w_ho, TensorType({DT_FLOAT16})) + .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(bias_o, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(o, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(h_t, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(expose_hidden, Bool, false) + .ATTR(num_output, Int, 0) + .OP_END_FACTORY_REG(RNN) + +/** +*@brief: BasicRNNCell operator. +*@par Inputs: +*nine inputs: \n +*@li x: A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_NZ. +*@li cont: A 1D Tensor. Must be one of the following types: float16. The format must be ND. +*@li w_xh_x_static: A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_NZ. +*@li h_0: A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li w_xh: A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_Z. +*@li w_hh: A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_Z. +*@li w_ho: A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_Z. +*@li bias_h: A 1D Tensor. Must be one of the following types: float16, float32. The format must be ND. +*@li bias_o: A 1D Tensor. Must be one of the following types: float16, float32. The format must be ND. + +*@par Attributes: +*@li expose_hidden: A bool identifying whether to expose the hidden state of the last time step. Defaults to false. +*@li num_output: An integer identifying the number of output features. Defaults to 0. + +*@par Outputs: +*two outputs: \n +*@li o_t: A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. +*@li h_t: A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
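+*
+* Example (an illustrative IR-build sketch; the set_input_*/set_attr_* setter
+* names follow the usual REG_OP-generated pattern and are assumptions):
+*   auto cell = op::BasicRNNCell("basic_rnn_cell")
+*                   .set_input_x(x)
+*                   .set_input_w_xh(w_xh)
+*                   .set_input_w_ho(w_ho)
+*                   .set_input_bias_h(bias_h)
+*                   .set_input_bias_o(bias_o)
+*                   .set_input_h_0(h_0)             // optional; pair with the attribute below
+*                   .set_attr_expose_hidden(true);  // expose the hidden state of the last step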
+*/ +REG_OP(BasicRNNCell) + .INPUT(x, TensorType({DT_FLOAT16})) + .OPTIONAL_INPUT(cont, TensorType({DT_FLOAT16})) + .OPTIONAL_INPUT(w_xh_x_static, TensorType({DT_FLOAT16, DT_FLOAT})) + .OPTIONAL_INPUT(h_0, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(w_xh, TensorType({DT_FLOAT16})) + .OPTIONAL_INPUT(w_hh, TensorType({DT_FLOAT16})) + .INPUT(w_ho, TensorType({DT_FLOAT16})) + .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(bias_o, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(o_t, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(h_t, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(expose_hidden, Bool, false) + .ATTR(num_output, Int, 0) + .OP_END_FACTORY_REG(BasicRNNCell) } // namespace ge #endif // GE_OP_RNN_H diff --git a/third_party/fwkacllib/inc/ops/roipooling_ops.h b/third_party/fwkacllib/inc/ops/roipooling_ops.h new file mode 100644 index 00000000..dd7a2213 --- /dev/null +++ b/third_party/fwkacllib/inc/ops/roipooling_ops.h @@ -0,0 +1,78 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_OP_ROIPOOLING_OPS_H_ +#define GE_OP_ROIPOOLING_OPS_H_ + +#include "graph/operator_reg.h" + +namespace ge { + +/** +*@brief Performs Region of Interest (ROI) pooling. + +*@par Inputs: +* Three inputs, including: +*@li x: An NC1HWC0 tensor of type float16 or float32, describing the feature map. +*@li rois: A tensor of type float16 or float32, with shape [batch, 5, roi_max_num], describing the ROIs. +*@li roi_actual_num: A tensor of type int32, with shape [batch, 8], specifying the number of ROIs per batch. + +*@par Attributes: +*@li roi_max_num: An optional int32, specifying the maximum number of ROIs per batch, at most 6000. Defaults to "3008". The value must be a multiple of 16. +*@li pooled_h: A required int32, specifying the pooled H. Must be greater than 0. +*@li pooled_w: A required int32, specifying the pooled W. Must be greater than 0. +*@li spatial_scale: An optional scaling factor for mapping the input coordinates to the ROI coordinates. Defaults to "0.0625". + +*@par Outputs: +*y: An NC1HWC0 tensor of type float16 or float32, describing the result feature map. + +*@attention Constraints:\n +*@li For the feature map input: \n (1) If pooled_h = pooled_w = 2, the feature map size must not exceed 50. \n (2) If pooled_h = pooled_w = 3, the feature map size must not exceed 60. \n (3) If pooled_h = pooled_w = 4, the feature map size must not exceed 70. \n (4) If pooled_h = pooled_w = 5, the feature map size must not exceed 70. \n (5) If pooled_h = pooled_w = 6, the feature map size must not exceed 80. \n (6) If pooled_h = pooled_w = 7, the feature map size must not exceed 80. \n (7) If pooled_h = pooled_w = 8, the feature map size must not exceed 80. \n (8) If pooled_h = pooled_w = 9, the feature map size must not exceed 70. \n (9) If pooled_h = pooled_w = 10, the feature map size must not exceed 70. \n (10) If pooled_h = pooled_w = 11, the feature map size must not exceed 70.
\n +(11) If pooled_h = pooled_w = 12, the feature map size must not exceed 70. \n +(12) If pooled_h = pooled_w = 13, the feature map size must not exceed 70. \n +(13) If pooled_h = pooled_w = 14, the feature map size must not exceed 70. \n +(14) If pooled_h = pooled_w = 15, the feature map size must not exceed 70. \n +(15) If pooled_h = pooled_w = 16, the feature map size must not exceed 70. \n +(16) If pooled_h = pooled_w = 17, the feature map size must not exceed 50. \n +(17) If pooled_h = pooled_w = 18, the feature map size must not exceed 40. \n +(18) If pooled_h = pooled_w = 19, the feature map size must not exceed 40. \n +(19) If pooled_h = pooled_w = 20, the feature map size must not exceed 40. \n +*/ + +REG_OP(RoiPooling) + .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(rois, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(roi_actual_num, TensorType({DT_INT32})) + .ATTR(roi_max_num, Int, 3008) + .REQUIRED_ATTR(pooled_h, Int) + .REQUIRED_ATTR(pooled_w, Int) + .ATTR(spatial_scale, Float, 0.0625) + .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) + .OP_END_FACTORY_REG(RoiPooling) + +} // namespace ge + +#endif // GE_OP_ROIPOOLING_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/rpn_proposals.h b/third_party/fwkacllib/inc/ops/rpn_proposals.h new file mode 100644 index 00000000..3ebf7589 --- /dev/null +++ b/third_party/fwkacllib/inc/ops/rpn_proposals.h @@ -0,0 +1,54 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + + #ifndef GE_OP_RPN_PROPOSALS_H + #define GE_OP_RPN_PROPOSALS_H + + #include "graph/operator_reg.h" + +namespace ge { +REG_OP(RpnProposals) + .INPUT(rois, TensorType({DT_FLOAT16})) + .INPUT(cls_bg_prob, TensorType({DT_FLOAT16})) + .INPUT(img_size, TensorType({DT_INT32})) + .REQUIRED_ATTR(score_threshold, Float) + .REQUIRED_ATTR(k, Int) + .REQUIRED_ATTR(min_size, Float) + .REQUIRED_ATTR(nms_threshold, Float) + .REQUIRED_ATTR(post_nms_num, Int) + .ATTR(score_filter, Bool, true) + .ATTR(box_filter, Bool, true) + .ATTR(score_sigmoid, Bool, false) + .OUTPUT(sorted_box, TensorType({DT_FLOAT16})) + .OP_END_FACTORY_REG(RpnProposals) + +REG_OP(RpnProposalsD) + .INPUT(rois, TensorType({DT_FLOAT16})) + .INPUT(cls_bg_prob, TensorType({DT_FLOAT16})) + .REQUIRED_ATTR(img_size, ListInt) + .REQUIRED_ATTR(score_threshold, Float) + .REQUIRED_ATTR(k, Int) + .REQUIRED_ATTR(min_size, Float) + .REQUIRED_ATTR(nms_threshold, Float) + .REQUIRED_ATTR(post_nms_num, Int) + .ATTR(score_filter, Bool, true) + .ATTR(box_filter, Bool, true) + .ATTR(score_sigmoid, Bool, false) + .OUTPUT(sorted_box, TensorType({DT_FLOAT16})) + .OP_END_FACTORY_REG(RpnProposalsD) +} // namespace ge + + #endif // GE_OP_RPN_PROPOSALS_H diff --git a/third_party/fwkacllib/inc/ops/sdca_ops.h b/third_party/fwkacllib/inc/ops/sdca_ops.h new file mode 100644 index 00000000..3f1e938a --- /dev/null +++ b/third_party/fwkacllib/inc/ops/sdca_ops.h @@ -0,0 +1,86 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_OP_SDCA_OPS_H +#define GE_OP_SDCA_OPS_H + +#include "graph/operator.h" +#include "graph/operator_reg.h" + +namespace ge { + +/** +*@brief Distributed version of the Stochastic Dual Coordinate Ascent (SDCA) optimizer for \n +*linear models with L1 + L2 regularization. As the global optimization objective is \n
*strongly convex, the optimizer optimizes the dual objective at each step. The \n
*optimizer applies each update one example at a time. Examples are sampled \n
*uniformly, the optimizer is learning-rate free, and it enjoys a linear convergence \n
*rate. + +*@par Inputs: +*@li sparse_example_indices: a list of vectors which contain example indices. +*@li sparse_feature_indices: a list of vectors which contain feature indices. +*@li sparse_feature_values: a list of vectors which contain the feature values associated with each feature group. +*@li dense_features: a list of matrices which contain the dense feature values. +*@li example_weights: a vector which contains the weight associated with each example. +*@li example_labels: a vector which contains the label/target associated with each example. +*@li sparse_indices: a list of vectors where each value is the indices which has \n +*corresponding weights in sparse_weights. This field may be omitted for the dense approach. +*@li sparse_weights: a list of vectors where each value is the weight associated with a sparse feature group.
+*@li dense_weights: a list of vectors where the values are the weights associated with a dense feature group. +*@li example_state_data: a list of vectors containing the example state data. + +*@par Attributes: +*@li loss_type: Type of the primal loss. Currently SdcaSolver supports logistic, squared and hinge losses. +*@li l1: Symmetric l1 regularization strength. +*@li l2: Symmetric l2 regularization strength. +*@li num_loss_partitions: Number of partitions of the global loss function. +*@li num_inner_iterations: Number of iterations per mini-batch. + +*@par Outputs: +*@li out_example_state_data: a list of vectors containing the updated example state data. +*@li out_delta_sparse_weights: a list of vectors where each value is the delta \n +*weights associated with a sparse feature group. +*@li out_delta_dense_weights: a list of vectors where the values are the delta \n +*weights associated with a dense feature group. + +*/ + +REG_OP(SdcaOptimizerV2) + .DYNAMIC_INPUT(sparse_example_indices, TensorType({DT_INT64})) + .DYNAMIC_INPUT(sparse_feature_indices, TensorType({DT_INT64})) + .DYNAMIC_INPUT(sparse_feature_values, TensorType({DT_FLOAT})) + .DYNAMIC_INPUT(dense_features, TensorType({DT_FLOAT})) + .INPUT(example_weights, TensorType({DT_FLOAT})) + .INPUT(example_labels, TensorType({DT_FLOAT})) + .DYNAMIC_INPUT(sparse_indices, TensorType({DT_INT64})) + .DYNAMIC_INPUT(sparse_weights, TensorType({DT_INT64})) + .DYNAMIC_INPUT(dense_weights, TensorType({DT_FLOAT})) + .INPUT(example_state_data, TensorType({DT_FLOAT})) + .OUTPUT(out_example_state_data, TensorType({DT_FLOAT})) + .DYNAMIC_OUTPUT(out_delta_sparse_weights, TensorType({DT_FLOAT})) + .DYNAMIC_OUTPUT(out_delta_dense_weights, TensorType({DT_FLOAT})) + .ATTR(adaptive, Bool, false) + .ATTR(num_sparse_features, Int, 0) + .ATTR(num_sparse_features_with_values, Int, 0) + .ATTR(num_dense_features, Int, 0) + .ATTR(num_loss_partitions, Int, 1) + .ATTR(num_inner_iterations, Int, 1) + .ATTR(loss_type, String, "logistic_loss") + .ATTR(l1, Float, 0.5) + .ATTR(l2, Float, 0.5) + .OP_END_FACTORY_REG(SdcaOptimizerV2) + +} // namespace ge + +#endif //GE_OP_SDCA_OPS_H \ No newline at end of file diff --git a/third_party/fwkacllib/inc/ops/selection_ops.h b/third_party/fwkacllib/inc/ops/selection_ops.h index 5b083282..dab71025 100644 --- a/third_party/fwkacllib/inc/ops/selection_ops.h +++ b/third_party/fwkacllib/inc/ops/selection_ops.h @@ -123,20 +123,20 @@ REG_OP(TileD) .OP_END_FACTORY_REG(TileD) /** -* @brief Gather slices from "params" into a tensor with shape specified by\n +* @brief Gather slices from "x" into a tensor with shape specified by\n * "indices". "indices" is an K-dimensional integer tensor, best thought of as a\n * (K-1)-dimensional tensor of "indices" into "params", where each element\n * defines a slice of "params":\n * output[\\(i_0, ..., i_{K-2}\\)] = params[indices[\\(i_0, ..., i_{K-2}\\)]]\n -* In gather_nd, "indices" defines slices into the first N dimensions of\n +* "indices" defines slices into the first N dimensions of\n * "params", where\n * N = indices.shape[-1]\n * indices = [[0, 0], [1, 1]]\n -* params = [['a', 'b'], ['c', 'd']]\n +* x = [['a', 'b'], ['c', 'd']]\n * output = ['a', 'd']\n * @par Inputs: -* @li params: A Tensor of type BasicType. +* @li x: A Tensor of type BasicType. * @li indices: A Tensor of type IndexNumberType. * @par Outputs: @@ -144,12 +144,12 @@ REG_OP(TileD) * @see GatherNd() * @attention Constraints: -* @li "params" is one of the following types: float16, float32, int32, int8, +* @li "x" is one of the following types: float16, float32, int32, int8, * uint8.
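+*
+* Shape note (the general gather_nd rule, stated here for clarity): with
+* K = indices.shape[-1], the output shape is indices.shape[:-1] + x.shape[K:].
+* In the example above, "indices" has shape (2, 2) and "x" has shape (2, 2),
+* so K = 2 and "output" has shape (2,).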
*/ REG_OP(GatherNd) - .INPUT(x1, TensorType::BasicType()) - .INPUT(x2, TensorType::IndexNumberType()) + .INPUT(x, TensorType::BasicType()) + .INPUT(indices, TensorType::IndexNumberType()) .OUTPUT(y, TensorType::BasicType()) .OP_END_FACTORY_REG(GatherNd) @@ -289,9 +289,9 @@ REG_OP(StridedSliceD) DT_BOOL})) .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_UINT8, DT_INT8, DT_BOOL})) - .ATTR(begin, ListInt, {}) - .ATTR(end, ListInt, {}) - .ATTR(strides, ListInt, {}) + .REQUIRED_ATTR(begin, ListInt) + .REQUIRED_ATTR(end, ListInt) + .REQUIRED_ATTR(strides, ListInt) .ATTR(begin_mask, Int, 0) .ATTR(end_mask, Int, 0) .ATTR(ellipsis_mask, Int, 0) @@ -337,10 +337,10 @@ REG_OP(StridedSliceD) REG_OP(StridedSliceGradD) .INPUT(dy, TensorType::BasicType()) .OUTPUT(output, TensorType::BasicType()) - .ATTR(shape, ListInt, {}) - .ATTR(begin, ListInt, {}) - .ATTR(end, ListInt, {}) - .ATTR(strides, ListInt, {}) + .REQUIRED_ATTR(shape, ListInt) + .REQUIRED_ATTR(begin, ListInt) + .REQUIRED_ATTR(end, ListInt) + .REQUIRED_ATTR(strides, ListInt) .ATTR(begin_mask, Int, 0) .ATTR(end_mask, Int, 0) .ATTR(ellipsis_mask, Int, 0) @@ -473,7 +473,7 @@ REG_OP(ReverseV2) *@par Inputs: * One input: *@li x: An ND Tensor (up to 8D). \n -*Must be one of the following types: int8, uint8, int16, uint16, int32, int64, bool, float32, double +*Must be one of the following types: int8, uint8, int16, uint16, int32, int64, bool, float32, float64 *@par Attributes: *axis: The indices of the dimensions to reverse. @@ -484,7 +484,7 @@ REG_OP(ReverseV2) *@attention Constraints: "axis" must be within the rank of "x". */ -REG_OP(ReverseExt2) +REG_OP(ReverseV2D) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_STRING})) @@ -492,7 +492,7 @@ REG_OP(ReverseExt2) DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_STRING})) .REQUIRED_ATTR(axis, ListInt) - .OP_END_FACTORY_REG(ReverseExt2) + .OP_END_FACTORY_REG(ReverseV2D) /** *@brief: Selects elements from "x1" or "x2", depending on "condition". @@ -513,6 +513,27 @@ REG_OP(Select) .OUTPUT(y,TensorType::BasicType()) .OP_END_FACTORY_REG(Select) +/** +*@brief: Selects elements from "x2" or "x3", depending on "condition". + +*@par Inputs: +* Three inputs, including: +* @li x1: A Tensor of type bool. +* @li x2: A Tensor. Must be one of the following types: float16, float32, int32, int8, uint8. +* @li x3: A Tensor of the same type as "x2". + +*@par Outputs: +*y: A Tensor. Has the same type as "x2". + +*/ +REG_OP(SelectV2) + .INPUT(x1, TensorType({DT_BOOL})) + .INPUT(x2, TensorType::BasicType()) + .INPUT(x3, TensorType::BasicType()) + .OUTPUT(y, TensorType::BasicType()) + .OP_END_FACTORY_REG(SelectV2) + + /** *@brief: Computes the maximum along segments of a tensor. *Computes a tensor such that output[i]=(data[i]) where max is over j such that segment_ids[j] == i. @@ -621,7 +642,7 @@ REG_OP(OneHotD) *@par Inputs: *@li x: A Tensor. Must be one of the following types: float16, float32, int8, int16, int32, int64, uint8, uint16, uint32, uint64. -*@li begin: A Tensor of type int32 or int64. The starting location for the slice. +*@li offsets: A Tensor of type int32 or int64. The starting location for the slice. *@li size: A Tensor of type int32 or int64. The tensor shape.
*@par Outputs: @@ -629,7 +650,7 @@ REG_OP(OneHotD) */ REG_OP(Slice) .INPUT(x, TensorType::BasicType()) - .INPUT(begin, TensorType::IndexNumberType()) + .INPUT(offsets, TensorType::IndexNumberType()) .INPUT(size, TensorType::IndexNumberType()) .OUTPUT(y, TensorType::BasicType()) .OP_END_FACTORY_REG(Slice) @@ -642,7 +663,7 @@ REG_OP(Slice) *x: A Tensor. Must be one of the following types: float16, float32, int8, int16, int32, int64, uint8, uint16, uint32, uint64. *@par Attributes: -*@li begin: The starting location for the slice. +*@li offsets: The starting location for the slice. *@li size: The tensor shape. *@par Outputs: @@ -651,8 +672,8 @@ REG_OP(Slice) REG_OP(SliceD) .INPUT(x, TensorType::BasicType()) .OUTPUT(y, TensorType::BasicType()) - .REQUIRED_ATTR(begin,ListInt) - .REQUIRED_ATTR(size,ListInt) + .REQUIRED_ATTR(offsets, ListInt) + .REQUIRED_ATTR(size, ListInt) .OP_END_FACTORY_REG(SliceD) /** @@ -660,15 +681,20 @@ REG_OP(SliceD) * dimension. * @par Inputs: -* @li input: A 1D or higher tensor of type float16, with the last dimension at +* @li x: A 1D or higher tensor of type float16, with the last dimension at * least "k". * Specifies the data to sort. * @li assist_seq: A 1D tensor of type float16. * With values 0, 1, 2, ..., N-1, where "N" is the last dimension. * @par Attributes: -* k: An int that is at least 0, specifying the number of top elements to look\n -* for along the last dimension (along each row for matrices). +* @li k: A required int that is at least 0, specifying the number of top elements \n +* to look for along the last dimension (along each row for matrices). +* @li sorted: An optional bool. Defaults to true.\n +* If true, the resulting "k" elements will be sorted by the values in descending +* order. +* @li dim: An optional int. Defaults to -1. For reserved use.\n +* @li largest: An optional bool. Defaults to true. For reserved use.\n * @par Outputs: * @li values: A Tensor, specifying the sorted data. Has the same type as "input". @@ -677,23 +703,24 @@ REG_OP(SliceD) * @attention Constraints: * @li k =< 4096 * @li Size of the last dimension =< 65500 - -* @see TopKV2() */ -REG_OP(TopK) - .INPUT(input, TensorType::RealNumberType()) +REG_OP(TopKD) + .INPUT(x, TensorType::RealNumberType()) .INPUT(assist_seq, TensorType({DT_FLOAT16})) .OUTPUT(values, TensorType::RealNumberType()) .OUTPUT(indices, TensorType({DT_INT32})) - .ATTR(k, Int, 0) - .OP_END_FACTORY_REG(TopK) + .REQUIRED_ATTR(k, Int) + .ATTR(sorted, Bool, true) + .ATTR(dim, Int, -1) + .ATTR(largest, Bool, true) + .OP_END_FACTORY_REG(TopKD) /** * @brief Finds values and indices of the "k" largest elements for the last * dimension. * @par Inputs: -* @li input: A 1D or higher tensor of type BasicType, with the last dimension +* @li x: A 1D or higher tensor of type BasicType, with the last dimension * at least "k". * @li k: A 0D Tensor of type int32.\n * Number of top elements to look for along the last dimension (along each row @@ -712,32 +739,31 @@ REG_OP(TopK) * @see TopK() */ -REG_OP(TopKV2) - .INPUT(input, TensorType::RealNumberType()) +REG_OP(TopK) + .INPUT(x, TensorType::RealNumberType()) .INPUT(k, TensorType({DT_INT32})) .OUTPUT(values, TensorType::RealNumberType()) .OUTPUT(indices, TensorType({DT_INT32})) .ATTR(sorted, Bool, true) - .ATTR(T, Int, 0) - .OP_END_FACTORY_REG(TopKV2) + .OP_END_FACTORY_REG(TopK) /** *@brief Creates a new tensor by applying sparse "updates" to individual values or slices within a tensor (initially zero for numeric, empty for string) of the given "shape" according to "indices". 
*@par Inputs: *Inputs including: \n * @li indices: A required index tensor. Must be one of the following types: float32, float16, int32, int8, uint8. -* @li updates: A required slice tensor. Must be one of the following types: float32, float16, int32, int8, uint8. +* @li x: A required slice tensor. Must be one of the following types: float32, float16, int32, int8, uint8. * @li shape: A required list of int32, specifying the output shape. *@par Outputs: *y:A output Tensor with same datatype as "updates". *@attention Constraints:\n *@li "y" has the same shape as "shape". -*@li "y" has the same type as "updates". +*@li "y" has the same type as "x". */ REG_OP(ScatterNd) .INPUT(indices, TensorType::BasicType()) - .INPUT(updates, TensorType::BasicType()) + .INPUT(x, TensorType::BasicType()) .INPUT(shape, TensorType::IndexNumberType()) .OUTPUT(y, TensorType::BasicType()) .OP_END_FACTORY_REG(ScatterNd) @@ -747,7 +773,7 @@ REG_OP(ScatterNd) *@par Inputs: *Inputs including: \n * @li indices: A required index tensor. Must be one of the following types: float32, float16, int32, int8, uint8. -* @li updates: A required slice tensor. Must be one of the following types: float32, float16, int32, int8, uint8. +* @li x: A required slice tensor. Must be one of the following types: float32, float16, int32, int8, uint8. *@par Attributes: * @li shape: A required list of int32, specifying the output shape. *@par Outputs: @@ -755,48 +781,48 @@ REG_OP(ScatterNd) *@attention Constraints:\n *@li "y" has the same shape as "shape". -*@li "y" has the same type as "updates". +*@li "y" has the same type as "x". */ REG_OP(ScatterNdD) .INPUT(indices, TensorType::IndexNumberType()) - .INPUT(updates, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT16})) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT16})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT16})) - .ATTR(shape, ListInt,{}) + .REQUIRED_ATTR(shape, ListInt) .OP_END_FACTORY_REG(ScatterNdD) /** -* @brief Says whether the targets are in the top "k" predictions.\n +* @brief Says whether the targets are in the top "k" predictions. * @par Inputs: * Three inputs, including: * @li x1: A 2D Tensor of type float32. A "batch_size * classes" tensor. * @li x2: A 1D Tensor of type IndexNumberType. A batch_size tensor of class ids. -* @li k: A 1D Tensor of the same type as "x2". -* Specifies the number of top elements to look at for computing precision. + +* @par Attributes: +* @li k: A required int32, specifying the number of top elements to look at for +* computing precision. * @par Outputs: * y: A Tensor of type bool. * @see InTopK() */ -REG_OP(InTopKExt2) +REG_OP(InTopKD) .INPUT(x1, TensorType({DT_FLOAT})) .INPUT(x2, TensorType({IndexNumberType})) - .INPUT(k, TensorType({IndexNumberType})) .OUTPUT(y, TensorType({DT_BOOL})) - .OP_END_FACTORY_REG(InTopKExt2) + .REQUIRED_ATTR(k, Int) + .OP_END_FACTORY_REG(InTopKD) /** -* @brief Says whether the targets are in the top "k" predictions\n +* @brief Says whether the targets are in the top "k" predictions. * @par Inputs: * Two inputs, including: * @li x1: A 2D Tensor of type float32. A "batch_size * classes" tensor. * @li x2: A 1D Tensor of type IndexNumberType. A batch_size tensor of class ids. - -* @par Attributes: -* @li k: An optional int32, specifying the number of top elements to look at for -* computing precision. +* @li k: A 1D Tensor of the same type as "x2". +* Specifies the number of top elements to look at for computing precision. * @par Outputs: * y: A Tensor of type bool. 
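+*
+* Example (semantics only): for x1 = [[0.1, 0.8, 0.1], [0.6, 0.3, 0.1]],
+* x2 = [1, 2] and k = 1, class 1 is the top-1 prediction for the first row but
+* class 2 is not the top-1 prediction for the second row, so y = [true, false].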
@@ -804,7 +830,7 @@ REG_OP(InTopKExt2) REG_OP(InTopK) .INPUT(x1, TensorType({DT_FLOAT})) .INPUT(x2, TensorType(IndexNumberType)) - .ATTR(k, Int, 1) + .INPUT(k, TensorType({IndexNumberType})) .OUTPUT(y, TensorType({DT_BOOL})) .OP_END_FACTORY_REG(InTopK) @@ -890,9 +916,9 @@ REG_OP(StridedSliceAssignD) .INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) .INPUT(input_value, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) .OUTPUT(var, TensorType(BasicType)) - .ATTR(begin, ListInt, {}) - .ATTR(end, ListInt, {}) - .ATTR(strides, ListInt, {}) + .REQUIRED_ATTR(begin, ListInt) + .REQUIRED_ATTR(end, ListInt) + .REQUIRED_ATTR(strides, ListInt) .ATTR(begin_mask, Int, 0) .ATTR(end_mask, Int, 0) .ATTR(ellipsis_mask, Int, 0) @@ -1221,7 +1247,7 @@ REG_OP(UnsortedSegmentMin) * @li k: A Tensor. * @par Attributes: -* num_segments: An int32, specifying the number of distinct segment IDs. +* num_segments: A required int32, specifying the number of distinct segment IDs. * @par Outputs: * y: A Tensor of type RealNumberType. @@ -1283,7 +1309,393 @@ REG_OP(UnsortedSegmentProdD) .OP_END_FACTORY_REG(UnsortedSegmentProdD) /** -*@brief Crops the input. +*@brief Normalizes data. It is called Region on YOLO v2 and Yolo on YOLO v3. + +*@par Inputs: +*x: An NCHW tensor of type float16 or float32. The data is with shape (N, boxes*(coords+obj+classes), H, W), where "obj" indicates the confidence of an object, and only one confidence is supported. Boxes are arranged as xx...xyy...yww...whh...hbb...bc0c0..c0c1c1...c1......cncn...cn. + +*@par Attributes: +*@li boxes: A required int32, specifying the number of anchor boxes. Defaults to "5" for V2 or "3" for V3. +*@li coords: An int32, specifying the number of parameters required for locating an object. The value is fixed at "4", corresponding to (x,y,w,h). +*@li classes: An int32, specifying the number of prediction classes. Defaults to "80". The value range is [1, 1024]. +*@li yolo_version: A string, specifying the YOLO version, either "V2" or "V3". +*@li softmax: A bool, specifying whether to perform softmax, valid only when "yolo_version = V2". +*@li background: A bool, specifying the operation types of the obj and classes, used in conjunction with "softmax" and valid only when "yolo_version = V2". +*@li softmaxtree: A bool. + +*@par Outputs: +*@li coord_data: A float16 or float32 with shape [N, boxes*coords, ceilx(height*width*2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the coordinates of a detected box. +*@li obj_prob: A float16 or float32 with shape [N, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the confidence. +*@li classes_prob: A float16 or float32 with shape [N, classes, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the prediction classes. + +*@attention Constraints: +*@li This operator applies to YOLO v2 and v3 networks. +*@li The succeeding layer of the Yolo operator must be operator Yolov3DetectionOutput.
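+*
+* Shape example (arithmetic only; here ceilx(a, b) rounds "a" up to a multiple
+* of "b"): with N = 1, boxes = 3, coords = 4, classes = 80 and a 13 x 13
+* feature map, coord_data has shape [1, 12, ceilx(13*13*2+32, 32)/2] =
+* [1, 12, 192] and obj_prob has shape [1, ceilx(3*13*13*2+32, 32)/2] = [1, 528].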
+*/ +REG_OP(Yolo) + .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .ATTR(boxes, Int, 3) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(yolo_version, String, "V3") + .ATTR(softmax, Bool, false) + .ATTR(background, Bool, false) + .ATTR(softmaxtree, Bool, false) + .OP_END_FACTORY_REG(Yolo) + +/** +*@brief Performs YOLO V3 detection. + +*@par Inputs: +*Ten inputs, including: +*@li Operator Yolov3DetectionOutput takes the outputs of operator Yolo as its inputs. A Yolo operator has three outputs: "coords", "obj", and "class". \n There are three Yolo operators at Yolov3DetectionOutput's preceding layer on Yolo v3. For details, see the description of operator Yolo. +*@li img_info: A float16, describing the image information including the required image height and width \n and the actual image height and width. +* +*@par Attributes: +*@li biases: A required list of floats. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" +*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. +*@li coords: Specifies the number of coordinate parameters. Must be 4. +*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. +*@li relative: An optional bool. Defaults to and must be "true". +*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo. The value range is [0.0, 1.0]. + +*@li post_nms_topn: An optional int32. This attribute is reserved. +*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo. The value range is [0.0, 1.0]. + +*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n + +*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". +* +*@par Outputs: +*@li box_out: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. +*@li box_out_num: An NCHW tensor of type int32, specifying the number of output boxes. + +*@attention Constraints:\n +*@li This operator applies only to the YOLO v3 network. +*@li The preceding layer of operator Yolov3DetectionOutput must be three Yolo operators.
+ +*@see Yolo() +*/ +REG_OP(YoloV3DetectionOutput) + .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) + .REQUIRED_ATTR(biases_low, ListFloat) + .REQUIRED_ATTR(biases_mid, ListFloat) + .REQUIRED_ATTR(biases_high, ListFloat) + .ATTR(boxes, Int, 3) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(relative, Bool, true) + .ATTR(obj_threshold, Float, 0.5) + .ATTR(post_nms_topn, Int, 1024) + .ATTR(score_threshold, Float, 0.5) + .ATTR(iou_threshold, Float, 0.45) + .ATTR(pre_nms_topn, Int, 512) + .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(box_out_num, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(YoloV3DetectionOutput) + +/** +*@brief Performs YOLO V3 detection. + +*@par Inputs: +*16 inputs, including: +*@li The outputs of operator Yolo at the preceding layer (that is, three Yolo operators on YOLO v3) are used as the inputs of operator Yolov3DetectionOutput. \n A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +*@li img_info: A float16, describing the image information including the required image height and width \n and the actual image height and width. +*@li windex: A windex tensor with shape [height, width]. Has the same type as the inputs. [[0,1,2...(width-1)],[0,1,2...(width-1)]...[0,1,2...(width-1)]] consisting of "height" groups of [0, 1, 2...(width-1)] is formed for the three Yolo outputs, respectively. + +*@li hindex: A hindex tensor with shape [height, width]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]] is formed for the three Yolo outputs, respectively. + +* +*@par Attributes: +*@li biases: A required list of floats. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" +*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. +*@li coords: Specifies the number of coordinate parameters. Must be 4. +*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. +*@li relative: An optional bool. Defaults to and must be "true". +*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo. The value range is [0.0, 1.0]. +*@li post_nms_topn: An optional int32. This attribute is reserved. +*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo. The value range is [0.0, 1.0]. +*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n +*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". +* +*@par Outputs: +*@li box_out: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence.
+*@li box_out_num: An NCHW tensor of type int32, specifying the number of output boxes. + +*@attention Constraints:\n +*@li This operator applies only to the YOLO v3 network. +*@li The preceding layer of operator Yolov3DetectionOutput must be three Yolo operators. +*@see Yolo() +*/ +REG_OP(YoloV3DetectionOutputD) + .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(windex1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(windex2, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(windex3, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(hindex1, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(hindex2, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(hindex3, TensorType({DT_FLOAT16,DT_FLOAT})) + .REQUIRED_ATTR(biases_low, ListFloat) + .REQUIRED_ATTR(biases_mid, ListFloat) + .REQUIRED_ATTR(biases_high, ListFloat) + .ATTR(boxes, Int, 3) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(relative, Bool, true) + .ATTR(obj_threshold, Float, 0.5) + .ATTR(post_nms_topn, Int, 1024) + .ATTR(score_threshold, Float, 0.5) + .ATTR(iou_threshold, Float, 0.45) + .ATTR(pre_nms_topn, Int, 512) + .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(box_out_num, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(YoloV3DetectionOutputD) + +/** +*@brief Performs object detection. + +*@par Inputs: +*@li cls_prob: An NCHW tensor of type float16 or float32, specifying the probability that the proposal is the background class. +*@li bbox_pred: An NCHW tensor of type float16 or float32, specifying the coordinates of the proposal bounding boxes. + +*@par Attributes: +*@li im_info: A required list of floats, specifying the image information. The value range is [1, 4096]. +*@li feat_stride: A required float32, specifying the stride of the sliding window. Must be greater than "0". Defaults to "16". +*@li base_size: A required float32, specifying the size of the generated base box. Must be greater than "0". Defaults to "16". +*@li min_size: A required float32, specifying the minimum edge length of a proposal. A box with any edge less than this value is removed. Must be greater than "0". Defaults to "16". +*@li ratio: A required list of floats, specifying the aspect ratio of the generated base box. Defaults to [0.5, 1, 2]. +*@li scale: A required list of floats, specifying the ratio of the size of the generated base box to "base_size". Defaults to [8, 16, 32]. +*@li pre_nms_topn: A required int, specifying top K boxes before NMS. For float16 input, pre_nms_topn <= 6000. For float32 input, pre_nms_topn <= 3000. Defaults to "3000". +*@li post_nms_topn: A required int, specifying the number of boxes to be output after NMS. The value is a multiple of 16. For float16 input, post_nms_topn <= 6000. For float32 input, post_nms_topn <= 3000 (the maximum multiple of 16 is 2992 within the range). Defaults to "304". +*@li nms_thresh: A required float32, specifying the NMS threshold. The value range is (0,1]. Defaults to "0.7".
+ +*@par Outputs: +*@li rois: A Tensor with shape [batch, 5, post_nms_topn], of type float16, specifying the output box information. "post_nms_topn" must be a multiple of 16. The dimension "5" indicates (batchID, x1, y1, x2, y2). The number of BBoxes output per batch is determined by "actual_rois_num". +*@li actual_rois_num: A Tensor with shape [batch, 8], of type int32, specifying the number of BBoxes output per batch. +*/ + REG_OP(Proposal) + .INPUT(cls_prob, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(bbox_pred, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(actual_rois_num, TensorType({DT_INT32})) + .ATTR(im_info, ListFloat, {375, 1240}) + .ATTR(feat_stride, Float, 16) + .ATTR(base_size, Float, 16) + .ATTR(min_size, ListFloat, {16, 16}) + .ATTR(ratio, ListFloat, {0.5, 1, 2}) + .ATTR(scale, ListFloat, {8, 16, 32}) + .ATTR(pre_nms_topn, Int, 6000) + .ATTR(post_nms_topn, Int, 304) + .ATTR(nms_thresh, Float, 0.7) + .OP_END_FACTORY_REG(Proposal) + +/** +*@brief Performs object detection. Different from Proposal, this is an internal API called after FE fusion and has an additional "rpn_bbox" input. The suffix "D" in the API name will be removed from the generated model. + +*@par Inputs: +*@li cls_prob: An NCHW tensor of type float16, specifying the probability that the proposal is the background class. +*@li bbox_pred: An NCHW tensor of type float16, specifying the coordinates of the proposal bounding boxes. +*@li rpn_bbox: An NCHW tensor of type float16, specifying the coordinates of the proposal bounding boxes. + +*@par Attributes: +*@li im_info: A required list of floats, specifying the image information. The value range is [1, 4096]. +*@li feat_stride: A required float32, specifying the stride of the sliding window. Must be greater than "0". Defaults to "16". +*@li base_size: A required float32, specifying the size of the generated base box. Must be greater than "0". Defaults to "16". +*@li min_size: A required float32, specifying the minimum edge length of a proposal. A box with any edge less than this value is removed. Must be greater than "0". Defaults to "16". +*@li ratio: A required list of floats, specifying the aspect ratio of the generated base box. Defaults to [0.5, 1, 2]. +*@li scale: A required list of floats, specifying the ratio of the size of the generated base box to "base_size". Defaults to [8, 16, 32]. +*@li pre_nms_topn: A required int, specifying top K boxes before NMS. For float16 input, pre_nms_topn <= 6000. For float32 input, pre_nms_topn <= 3000. Defaults to "3000". +*@li post_nms_topn: A required int, specifying the number of boxes to be output after NMS. The value is a multiple of 16. For float16 input, post_nms_topn <= 6000. For float32 input, post_nms_topn <= 3000 (the maximum multiple of 16 is 2992 within the range). Defaults to "304". +*@li nms_thresh: A required float32, specifying the NMS threshold. The value range is (0,1]. Defaults to 0.7. + +*@par Outputs: +*@li rois: A Tensor with shape [batch, 5, post_nms_topn], of type float16, specifying the output box information. "post_nms_topn" must be a multiple of 16. The dimension "5" indicates (batchID, x1, y1, x2, y2). The number of BBoxes output per batch is determined by "actual_rois_num". +*@li actual_rois_num: A Tensor with shape [batch, 8], of type int32, specifying the number of BBoxes output per batch.
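+*
+* Example (an illustrative IR-build sketch; the set_input_*/set_attr_* setter
+* names follow the usual REG_OP-generated pattern and are assumptions):
+*   auto proposal = op::ProposalD("proposal")
+*                       .set_input_cls_prob(cls_prob)
+*                       .set_input_bbox_pred(bbox_pred)
+*                       .set_input_rpn_bbox(rpn_bbox)
+*                       .set_attr_post_nms_topn(304);  // must stay a multiple of 16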
+*/ +REG_OP(ProposalD) + .INPUT(cls_prob, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(bbox_pred, TensorType({DT_FLOAT16, DT_FLOAT})) + .INPUT(rpn_bbox, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(actual_rois_num, TensorType({DT_INT32})) + .ATTR(im_info, ListFloat, {375, 1240}) + .ATTR(feat_stride, Float, 16) + .ATTR(base_size, Float, 16) + .ATTR(min_size, ListFloat, {16, 16}) + .ATTR(ratio, ListFloat, {0.5, 1, 2}) + .ATTR(scale, ListFloat, {8, 16, 32}) + .ATTR(pre_nms_topn, Int, 6000) + .ATTR(post_nms_topn, Int, 304) + .ATTR(nms_thresh, Float, 0.7) + .OP_END_FACTORY_REG(ProposalD) + +/** +*@brief Performs YOLO V2 detection. + +*@par Inputs: +* Four inputs, including: +*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov2DetectionOutput. \n Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +*@li img_info: A float16, describing the image information including the required image height and width \n and the actual image height and width. +* +*@par Attributes: +*@li biases: A required list of floats. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" +*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. +*@li coords: Specifies the number of coordinate parameters. Must be 4. +*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. +*@li relative: An optional bool. Defaults to and must be "true". +*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo. The value range is [0.0, 1.0]. + +*@li post_nms_topn: An optional int32. This attribute is reserved. +*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo. The value range is [0.0, 1.0]. +*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n +*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". +* +*@par Outputs: +*@li box_out: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. +*@li box_out_num: An NCHW tensor of type int32, specifying the number of output boxes. + +*@attention Constraints:\n +*@li This operator applies only to the YOLO v2 network. +*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator. + +*@see Yolo() +*/ +REG_OP(YoloV2DetectionOutput) + .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) + .REQUIRED_ATTR(biases, ListFloat) + .ATTR(boxes, Int, 5) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(relative, Bool, true) + .ATTR(obj_threshold, Float, 0.5) + .ATTR(post_nms_topn, Int, 1024) + .ATTR(score_threshold, Float, 0.5) + .ATTR(iou_threshold, Float, 0.45) + .ATTR(pre_nms_topn, Int, 512) + .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(box_out_num, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(YoloV2DetectionOutput) + +/** +*@brief Performs YOLO V2 detection.
+ +*@par Inputs: +*Six inputs, including: +*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov2DetectionOutput. \n Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +*@li img_info: A float16, describing the image information including the required image height and width \n and the actual image height and width. +*@li windex: A windex tensor with shape [height, width]. Has the same type as the inputs. [[0,1,2...(width-1)],[0,1,2...(width-1)]...[0,1,2...(width-1)]] consisting of "height" groups of [0, 1, 2...(width-1)] is formed. \n + +*@li hindex: A hindex tensor with shape [height, width]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]]. \n + +* +*@par Attributes: +*@li biases: A required list of floats. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" +*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. +*@li coords: Specifies the number of coordinate parameters. Must be 4. +*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. +*@li relative: An optional bool. Defaults to and must be "true". +*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo. The value range is [0.0, 1.0]. +*@li post_nms_topn: An optional int32. This attribute is reserved. +*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo. The value range is [0.0, 1.0]. + +*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n +*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". +* +*@par Outputs: +*@li box_out: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. +*@li box_out_num: An NCHW tensor of type int32, specifying the number of output boxes. +* +*@attention Constraints:\n +*@li This operator applies only to the YOLO v2 network. +*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator. + +*@see Yolo() +*/ +REG_OP(YoloV2DetectionOutputD) + .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(windex, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(hindex, TensorType({DT_FLOAT16,DT_FLOAT})) + .REQUIRED_ATTR(biases, ListFloat) + .ATTR(boxes, Int, 5) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(relative, Bool, true) + .ATTR(obj_threshold, Float, 0.5) + .ATTR(post_nms_topn, Int, 1024) + .ATTR(score_threshold, Float, 0.5) + .ATTR(iou_threshold, Float, 0.45) + .ATTR(pre_nms_topn, Int, 512) + .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(box_out_num, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(YoloV2DetectionOutputD) + +/** +*@brief Performs plane or channel conversion on YoloV2. +* If reverse=true: (N, H, W, C)->(N, H*stride, W*stride, C/(stride*stride)) +* If reverse=false: (N, H, W, C)->(N, H/stride, W/stride, C*(stride*stride)) + +*@par Inputs: +*x: An (N, H, W, C) tensor.
All data types are supported. + +*@par Attributes: +*@li stride: An optional int32, specifying the plane or channel scaling factor. Defaults to "2". +*@li reverse: An optional bool, specifying the conversion mode. If "true", depth to space conversion is performed. If "false", space to depth conversion is performed. Defaults to "false". + +*@par Outputs: +*y: An (N, H, W, C) tensor. All data types are supported. + +*@attention Constraints: +*@li If reverse=true: C/(stride*stride) yields an integer result. If reverse=false: W/stride and H/stride yield integer results. +*/ +REG_OP(PassThrough) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64})) + .ATTR(stride, Int, 2) + .ATTR(reverse, Bool, false) + .OP_END_FACTORY_REG(PassThrough) + +/** +*@brief Crops the input tensor "x" to the shape of "size". For example: \n *(1) x: bottom to be cropped, with shape (20, 50, 512, 512);\n *(2) size: reference input for cropping, with shape (20, 10, 256, 256);\n *(3) axis = 1;\n *(4) offsets = (25, 128, 128);\n *(5) y = x[:, 25:25 + size.shape[1], 128:128 + size.shape[2], 128:128 + size.shape[3]]. *@par Inputs: *Inputs include: \n @@ -1305,8 +1717,72 @@ REG_OP(Crop) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_UINT32,DT_INT64,DT_UINT64})) .INPUT(size, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_UINT32,DT_INT64,DT_UINT64})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_UINT32,DT_INT64,DT_UINT64})) - .ATTR(axis, Int, 2) - .REQUIRED_ATTR(offsets, ListInt) + .ATTR(axis, Int, 2) + .REQUIRED_ATTR(offsets, ListInt) .OP_END_FACTORY_REG(Crop) + +/** +*@brief Extends the input with copies of data along a specified dimension. For example: \n *(1) If x = [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]], with shape (2, 3, 2);\n *(2) axis = 1;\n *(3) tiles = 2;\n *(4) Then, y = [[[1, 2], [3, 4], [5, 6], [1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12], [7, 8], [9, 10], [11, 12]]], with shape (2, 6, 2). + +*@par Inputs: +* One input: +*x: A Tensor with any format. Must be one of the following types: float16, float32, int8, int16, int32, int64, uint8, uint16, uint32, uint64. + +*@par Attributes: +*@li axis: An optional int32, specifying the axis to tile. Defaults to 1. +*@li tiles: A required int32, specifying the number of copies (tiles) to output. + +*@par Outputs: +*y: A Tensor of any format. Must be one of the following types: float16, float32, int8, int16, int32, int64, uint8, uint16, uint32, uint64. + +*@attention Constraints:\n +*@li "axis" must be within the rank of the input tensor. +*@li "tiles" must be greater than 1. +*/ +REG_OP(TileWithAxis) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT64, DT_INT32, + DT_INT16, DT_INT8, DT_UINT64, DT_UINT32, DT_UINT16, DT_UINT8})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT64, DT_INT32, + DT_INT16, DT_INT8, DT_UINT64, DT_UINT32, DT_UINT16, DT_UINT8})) + .ATTR(axis, Int, 1) + .REQUIRED_ATTR(tiles, Int) + .OP_END_FACTORY_REG(TileWithAxis) + +/** +*@brief Reads data with offset and stride. + +*@par Inputs: +*One input:\n +*x: A Tensor. Must be one of the following types: float16, int8. + +*@par Attributes: +*@li stride_list: An optional 5D list of type int32. Defaults to "[1,1,1,1,1]".
+ +*@par Outputs: +*y: A Tensor of the same type as "x". +*/ +REG_OP(ReadSelect) + .INPUT(x, TensorType::ALL()) + .OUTPUT(y, TensorType::ALL()) + .ATTR(stride_list, ListInt, {1,1,1,1,1}) + .OP_END_FACTORY_REG(ReadSelect) + +/** +*@brief: Writes data with offset. + +*@par Inputs:\n +*x: A Tensor. Must be one of the following types: int32, float32, float16, int8. + +*@par Outputs: +*y: A Tensor. Has the same type as "x". +*/ +REG_OP(WriteSelect) + .INPUT(x, TensorType::ALL()) + .OUTPUT(y, TensorType::ALL()) + .OP_END_FACTORY_REG(WriteSelect) } // namespace ge #endif // GE_OP_SELECTION_OPS_H diff --git a/third_party/fwkacllib/inc/ops/set_ops.h b/third_party/fwkacllib/inc/ops/set_ops.h index 8b4ca579..dc9bc5c9 100644 --- a/third_party/fwkacllib/inc/ops/set_ops.h +++ b/third_party/fwkacllib/inc/ops/set_ops.h @@ -22,6 +22,27 @@ namespace ge { +/** +*@brief Applies set operation along last dimension of 2 Tensor inputs. + +*@par Inputs: +*Inputs include: \n +* @li x1: A Tensor. Must be one of the following types: int8, int16, int32, int64, uint8, uint16, string. +* @li x2: A Tensor. Must have the same type as x1. + +*@par Attributes: +*@li set_operation: A string. +*@li validate_indices: An optional bool. Defaults to True. + +*@par Outputs: +*@li y_indices: A Tensor of type int64. +*@li y_values: A Tensor. Has the same type as x1. +*@li y_shape: A Tensor of type int64. + +*@attention Constraints:\n +*-The implementation for DenseToDenseSetOperation on Ascend uses AICPU, which has poor performance.\n + +*/ REG_OP(DenseToDenseSetOperation) .INPUT(x1, TensorType({DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, \ DT_INT32, DT_INT64, DT_STRING})) @@ -35,6 +56,29 @@ REG_OP(DenseToDenseSetOperation) .ATTR(validate_indices, Bool, true) .OP_END_FACTORY_REG(DenseToDenseSetOperation) +/** +*@brief Applies set operation along last dimension of Tensor and SparseTensor. + +*@par Inputs: +*Inputs include: \n +* @li x1: A Tensor. Must be one of the following types: int8, int16, int32, int64, uint8, uint16, string. +* @li x2_indices: A Tensor of type int64. 2D Tensor, indices of a SparseTensor. +* @li x2_values: A Tensor. Must have the same type as x1. 1D Tensor, values of a SparseTensor. +* @li x2_shape: A Tensor of type int64. 1D Tensor, shape of a SparseTensor. + +*@par Attributes: +*@li set_operation: A string. +*@li validate_indices: An optional bool. Defaults to True. + +*@par Outputs: +*@li y_indices: A Tensor of type int64. +*@li y_values: A Tensor. Has the same type as x1. +*@li y_shape: A Tensor of type int64. + +*@attention Constraints:\n +*-The implementation for DenseToSparseSetOperation on Ascend uses AICPU, which has poor performance.\n + +*/ REG_OP(DenseToSparseSetOperation) .INPUT(x1, TensorType({DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, \ DT_INT32, DT_INT64, DT_STRING})) @@ -50,6 +94,32 @@ REG_OP(DenseToSparseSetOperation) .ATTR(validate_indices, Bool, true) .OP_END_FACTORY_REG(DenseToSparseSetOperation) +/** +*@brief Applies set operation along last dimension of 2 SparseTensor inputs. + +*@par Inputs: +*Inputs include: \n +* @li x1_indices: A Tensor of type int64. 2D Tensor, indices of a SparseTensor. +* @li x1_values: A Tensor. Must be one of the following types: int8, int16, \n + int32, int64, uint8, uint16, string. 1D Tensor, values of a SparseTensor. +* @li x1_shape: A Tensor of type int64. 1D Tensor, shape of a SparseTensor. +* @li x2_indices: A Tensor of type int64. 2D Tensor, indices of a SparseTensor. +* @li x2_values: A Tensor. Must have the same type as x1_values. 1D Tensor, values of a SparseTensor.
+* @li x2_shape: A Tensor of type int64. 1D Tensor, shape of a SparseTensor. + +*@par Attributes: +*@li set_operation: A string. +*@li validate_indices: An optional bool. Defaults to True. + +*@par Outputs: +*@li y_indices: A Tensor of type int64. +*@li y_values: A Tensor. Has the same type as x1_values. +*@li y_shape: A Tensor of type int64. + +*@attention Constraints:\n +*-The implementation for SparseToSparseSetOperation on Ascend uses AICPU, which has poor performance.\n + +*/ REG_OP(SparseToSparseSetOperation) .INPUT(x1_indices, TensorType({DT_INT64})) .INPUT(x1_values, TensorType({DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, \ @@ -67,10 +137,29 @@ REG_OP(SparseToSparseSetOperation) .ATTR(validate_indices, Bool, true) .OP_END_FACTORY_REG(SparseToSparseSetOperation) +/** +*@brief Computes the number of unique elements along the last dimension of the input set. + +*@par Inputs: +*Inputs include: \n +* @li set_indices: A Tensor of type int64. 2D Tensor, indices of a SparseTensor. +* @li set_values: A Tensor. Must be one of the following types: int8, int16, int32, int64, uint8, uint16. +* @li set_shape: A Tensor of type int64. 1D Tensor, shape of a SparseTensor. + +*@par Attributes: +*validate_indices: An optional bool. Defaults to True. + +*@par Outputs: +*size: A Tensor of type int32. + +*@attention Constraints:\n +*-The implementation for SetSize on Ascend uses AICPU, which has poor performance.\n + +*/ REG_OP(SetSize) .INPUT(set_indices, TensorType({DT_INT64})) .INPUT(set_values, TensorType({DT_INT8, DT_INT16, \ - DT_UINT8, DT_UINT16, DT_INT32, DT_INT64})) + DT_UINT8, DT_UINT16, DT_INT32, DT_INT64, DT_STRING})) .INPUT(set_shape, TensorType({DT_INT64})) .OUTPUT(size, TensorType({DT_INT32})) .ATTR(validate_indices, Bool, true) diff --git a/third_party/fwkacllib/inc/ops/sparse_ops.h b/third_party/fwkacllib/inc/ops/sparse_ops.h index 246fbc9b..87f44a54 100644 --- a/third_party/fwkacllib/inc/ops/sparse_ops.h +++ b/third_party/fwkacllib/inc/ops/sparse_ops.h @@ -21,6 +21,19 @@ namespace ge { +/** +*@brief Applies softmax to a batched ND SparseTensor. + +*@par Inputs: +*The input must be a batched ND SparseTensor. +* @li indices: A matrix Tensor of type int64. 2D. The indices of the SparseTensor. +* @li values: A vector Tensor of type float or double. 1D. The values of the SparseTensor. +* @li shape: A vector Tensor of type int64. 1D. The shape of the SparseTensor. + +*@par Outputs: +*y: A vector Tensor. 1D. Has the same type as "values". + +*/ REG_OP(SparseSoftmax) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_FLOAT, DT_DOUBLE})) @@ -28,26 +41,70 @@ REG_OP(SparseSoftmax) .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) .OP_END_FACTORY_REG(SparseSoftmax) +/** +*@brief Adds up a SparseTensor and a dense Tensor, producing a dense Tensor. + +*@par Inputs: +*Inputs "x1_*" must be SparseTensors and "x2" must be a dense Tensor. +* @li x1_indices: A matrix Tensor of type int32 or int64. 2D. The indices of the SparseTensor. +* @li x1_values: The values of the SparseTensor. A vector Tensor. 1D. +* @li x1_shape: A vector Tensor of type int32 or int64. 1D. The shape of the SparseTensor. +* @li x2: A matrix Tensor. Has the same type and same shape as the SparseTensor. + +*@par Outputs: +*y: A matrix Tensor. Has the same type and same shape as "x2".
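+*
+* Example (semantics only): for x1_indices = [[0, 0], [1, 2]], x1_values = [1, 2],
+* x1_shape = [2, 3] and x2 = [[5, 5, 5], [5, 5, 5]], the result is
+* y = [[6, 5, 5], [5, 5, 7]].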
+
+*/
+
 REG_OP(SparseTensorDenseAdd)
     .INPUT(x1_indices, TensorType({DT_INT32, DT_INT64}))
-    .INPUT(x1_values, TensorType({DT_INT8, DT_UINT8, DT_INT16, \
-        DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT}))
+    .INPUT(x1_values, TensorType({DT_INT64, DT_INT32, DT_UINT16, DT_INT16, DT_UINT8, DT_INT8, \
+        DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
     .INPUT(x1_shape, TensorType({DT_INT32, DT_INT64}))
-    .INPUT(x2, TensorType({DT_INT64}))
-    .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
-        DT_INT32, DT_INT64, DT_FLOAT}))
+    .INPUT(x2, TensorType({DT_INT64, DT_INT32, DT_UINT16, DT_INT16, DT_UINT8, DT_INT8, \
+        DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
+    .OUTPUT(y, TensorType({DT_INT64, DT_INT32, DT_UINT16, DT_INT16, DT_UINT8, DT_INT8, \
+        DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
     .OP_END_FACTORY_REG(SparseTensorDenseAdd)
 
+/**
+*@brief Reorders a SparseTensor into the canonical, row-major ordering.
+
+*@par Inputs:
+* @li indices: A matrix Tensor of type int64. 2D. The indices of the SparseTensor.
+* @li values: Values of the SparseTensor. A vector Tensor. 1D.
+* @li shape: A vector Tensor of type int64. 1D. The shape of the SparseTensor.
+
+*@par Outputs:
+*@li y_indices: The indices of the SparseTensor. Has the same type as "indices".
+*@li y_values: The values of the SparseTensor. Has the same type as "values".
+
+*/
 REG_OP(SparseReorder)
     .INPUT(indices, TensorType({DT_INT64}))
     .INPUT(values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \
-        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE}))
+        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \
+        DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING}))
     .INPUT(shape, TensorType({DT_INT64}))
     .OUTPUT(y_indices, TensorType({DT_INT64}))
     .OUTPUT(y_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \
-        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE}))
+        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \
+        DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING}))
     .OP_END_FACTORY_REG(SparseReorder)
 
+/**
+*@brief Reshapes a SparseTensor to represent values in a new dense shape.
+
+*@par Inputs:
+* @li indices: A matrix Tensor of type int64. 2D. The indices of the SparseTensor.
+* @li shape: A vector Tensor of type int64. 1D. The shape of the SparseTensor.
+* @li new_shape: A 1D Tensor of type int64. The requested new dense shape.
+
+*@par Outputs:
+*@li y_indices: A Tensor of type int64. The indices in the new dense shape.
+*@li y_shape: A Tensor of type int64. The new dense shape.
+
+*/
 REG_OP(SparseReshape)
     .INPUT(indices, TensorType({DT_INT64}))
     .INPUT(shape, TensorType({DT_INT64}))
@@ -56,104 +113,270 @@ REG_OP(SparseReshape)
     .OUTPUT(y_shape, TensorType({DT_INT64}))
     .OP_END_FACTORY_REG(SparseReshape)
 
+/**
+*@brief Adds up a SparseTensor and a dense Tensor. \n
+*(1) Broadcasts the dense side to have the same shape as the sparse side, if eligible; \n
+*(2) Then, only the dense values pointed to by the indices of the SparseTensor participate in the cwise addition.
+
+*@par Inputs:
+* @li x1_indices: A matrix Tensor of type int64. 2D. The indices of the SparseTensor.
+* @li x1_values: The values of the SparseTensor. A vector Tensor. 1D.
+* @li x1_shape: A 1D Tensor of type int64. The shape of the SparseTensor.
+* @li x2: A dense Tensor of the same type as "x1_values".
+
+*@par Outputs:
+*y: A Tensor. Has the same type as "x1_values".
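+*
+*@par Example
+*An informative sketch of wiring this op into a graph (the chained
+*set_input_ calls are the accessors generated by operator_reg.h;
+*the operand names are placeholders):
+*@code
+*    ge::op::SparseDenseCwiseAdd cwise_add("sparse_dense_cwise_add");
+*    cwise_add.set_input_x1_indices(indices)
+*             .set_input_x1_values(values)
+*             .set_input_x1_shape(shape)
+*             .set_input_x2(dense);   // broadcast against the sparse side
+*@endcode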
+ +*/ REG_OP(SparseDenseCwiseAdd) .INPUT(x1_indices, TensorType({DT_INT64})) .INPUT(x1_values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, \ + DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x1_shape, TensorType({DT_INT64})) .INPUT(x2, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, \ - DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, \ - DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .OP_END_FACTORY_REG(SparseDenseCwiseAdd) +/** +*@brief Divides a SparseTensor by a dense Tensor. + +*@par Inputs: +* @li x1_indices: A matrix Tensor of type int64. 2D. The indices of the SparseTensor. +* @li x1_values: The values of the SparseTensor. A vector Tensor. 1D. +* @li x1_shape: A 1D Tensor of type int64. The requested new dense shape. +* @li x2: A dense Tensor of the same type as "x1_values". + +*@par Outputs: +*y: A Tensor. Has the same type as "x1_values". + +*/ REG_OP(SparseDenseCwiseDiv) .INPUT(x1_indices, TensorType({DT_INT64})) .INPUT(x1_values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, \ + DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x1_shape, TensorType({DT_INT64})) .INPUT(x2, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, \ - DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, \ - DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .OP_END_FACTORY_REG(SparseDenseCwiseDiv) +/** +*@brief Multiplies a SparseTensor by a dense Tensor. + +*@par Inputs: +* @li x1_indices: A matrix Tensor of type int64. 2D. The indices of the SparseTensor. +* @li x1_values: The values of the SparseTensor. A vector Tensor. 1D. +* @li x1_shape: A 1D Tensor of type int64. The requested new dense shape. +* @li x2: A dense Tensor of the same type as "x1_values". + +*@par Outputs: +*y: A Tensor. Has the same type as "x1_values". + +*/ REG_OP(SparseDenseCwiseMul) .INPUT(x1_indices, TensorType({DT_INT64})) .INPUT(x1_values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, \ + DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x1_shape, TensorType({DT_INT64})) .INPUT(x2, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, \ - DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, \ - DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .OP_END_FACTORY_REG(SparseDenseCwiseMul) +/** +*@brief Adds a SparseTensor to a SparseTensorsMap. + +*@par Inputs: +* The input tensor must be a SparseTensor. +* @li x1_indices: A matrix Tensor of type int64. 2D. The indices of the SparseTensor. +* @li x1_values: The values of the SparseTensor. A vector Tensor. 1D. +* @li x1_shape: A 1D Tensor of type int64. 
The shape of the SparseTensor.
+
+*@par Attributes:
+*@li container: An optional string. Defaults to "".
+*@li shared_name: An optional string. Defaults to "".
+
+*@par Outputs:
+*handle: A Tensor of type int64.
+
+*/
 REG_OP(AddSparseToTensorsMap)
     .INPUT(indices, TensorType({DT_INT64}))
     .INPUT(values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \
-        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE}))
+        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \
+        DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING}))
     .INPUT(shape, TensorType({DT_INT64}))
     .OUTPUT(handle, TensorType({DT_INT64}))
     .ATTR(container, String, "")
     .ATTR(shared_name, String, "")
     .OP_END_FACTORY_REG(AddSparseToTensorsMap)
 
+/**
+*@brief The gradient operator for the SparseSlice op.
+
+*@par Inputs:
+* @li backprop_val_grad: A 1D Tensor. The gradient with respect to the non-empty values of the sliced SparseTensor.
+* @li indices: A matrix Tensor of type int64. 2D. The indices of the SparseTensor.
+* @li start: A 1D Tensor of type int64. The start of the slice.
+* @li new_indices: A matrix Tensor of type int64. 2D. The indices of the sliced SparseTensor.
+
+*@par Outputs:
+*y_grad: A Tensor. Has the same type as "backprop_val_grad".
+
+*/
 REG_OP(SparseSliceGrad)
     .INPUT(backprop_val_grad, TensorType({ DT_INT8, DT_UINT8, DT_INT16,
-        DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT, DT_FLOAT16, DT_DOUBLE }))
+        DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT, DT_FLOAT16, DT_DOUBLE,
+        DT_COMPLEX64, DT_COMPLEX128}))
     .INPUT(indices, TensorType({DT_INT64}))
     .INPUT(start, TensorType({DT_INT64}))
     .INPUT(new_indices, TensorType({DT_INT64}))
     .OUTPUT(y_grad, TensorType({ DT_INT8, DT_UINT8, DT_INT16,
-        DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT, DT_FLOAT16, DT_DOUBLE }))
+        DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT, DT_FLOAT16, DT_DOUBLE,
+        DT_COMPLEX64, DT_COMPLEX128 }))
     .OP_END_FACTORY_REG(SparseSliceGrad)
 
+/**
+*@brief Slices a SparseTensor based on "start" and "size".
+
+*@par Inputs:
+* @li indices: A 2D Tensor of type int64. The indices of the SparseTensor.
+* @li values: A 1D Tensor. The values of the SparseTensor.
+* @li shape: A 1D Tensor of type int64. The shape of the SparseTensor.
+* @li start: A 1D Tensor of type int64. The start of the slice.
+* @li size: A 1D Tensor of type int64. The size of the slice.
+
+*@par Outputs:
+*@li y_indices: A Tensor of type int64.
+*@li y_values: A Tensor. Has the same type as "values".
+*@li y_shape: A Tensor of type int64.
+
+*/
 REG_OP(SparseSlice)
     .INPUT(indices, TensorType({DT_INT64}))
-    .INPUT(values, TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16,
-        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE }))
+    .INPUT(values, TensorType({DT_INT64, DT_INT32, DT_UINT16, DT_INT16, \
+        DT_UINT8, DT_INT8, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, \
+        DT_COMPLEX128, DT_BOOL, DT_STRING, DT_RESOURCE}))
     .INPUT(shape, TensorType({DT_INT64}))
     .INPUT(start, TensorType({DT_INT64}))
     .INPUT(size, TensorType({DT_INT64}))
     .OUTPUT(y_indices, TensorType({DT_INT64}))
-    .OUTPUT(y_values, TensorType({ DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16,
-        DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE }))
+    .OUTPUT(y_values, TensorType({DT_INT64, DT_INT32, DT_UINT16, DT_INT16, \
+        DT_UINT8, DT_INT8, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, \
+        DT_COMPLEX128, DT_BOOL, DT_STRING, DT_RESOURCE}))
    .OUTPUT(y_shape, TensorType({DT_INT64}))
    .OP_END_FACTORY_REG(SparseSlice)
 
+/**
+*@brief The gradient operator for the SparseAdd op.
+
+*@par Inputs:
+* @li backprop_val_grad: A 1D Tensor with shape [nnz(sum)]. The gradient with respect to the non-empty values of the sum.
+* @li x1_indices: A 2D Tensor of type int64. The indices of the SparseTensor A, with size [nnz(A), ndims]. +* @li x2_indices: A 2D Tensor of type int64. The indices of the SparseTensor B, with size [nnz(B), ndims]. +* @li sum_indices: A 2D Tensor of type int64. The indices of the sum SparseTensor, with size [nnz(sum), ndims]. + +*@par Outputs: +*x1_val_grad: A Tensor. Has the same type as "backprop_val_grad". +*x2_val_grad: A Tensor. Has the same type as "backprop_val_grad". + +*/ REG_OP(SparseAddGrad) .INPUT(backprop_val_grad, TensorType({DT_INT8, DT_INT16, DT_INT32, - DT_INT64, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x1_indices, TensorType({DT_INT64})) .INPUT(x2_indices, TensorType({DT_INT64})) .INPUT(sum_indices, TensorType({DT_INT64})) .OUTPUT(x1_val_grad, TensorType({DT_INT8, DT_INT16, DT_INT32, - DT_INT64, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .OUTPUT(x2_val_grad, TensorType({DT_INT8, DT_INT16, DT_INT32, - DT_INT64, DT_FLOAT, DT_DOUBLE})) + DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .OP_END_FACTORY_REG(SparseAddGrad) +/** +*@brief The gradient of SparseFillEmptyRows. + +*@par Inputs: +* @li reverse_index_map: A 1D Tensor of type int64. The reverse index map from SparseFillEmptyRows. +* @li grad_values: A 1D Tensor. The gradients from backprop. + +*@par Outputs: +*@li y_value: A Tensor. Has the same type as "grad_values". +*@li y_default_value: A Tensor. Has the same type as "grad_values". + +*/ REG_OP(SparseFillEmptyRowsGrad) .INPUT(reverse_index_map, TensorType({DT_INT64})) .INPUT(grad_values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .OUTPUT(y_value, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .OUTPUT(y_default_value, TensorType({DT_INT8, DT_UINT8, DT_INT16, \ - DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .OP_END_FACTORY_REG(SparseFillEmptyRowsGrad) +/** +*@brief Multiplies SparseTensor A (of rank 2) by dense matrix B. + +*@par Inputs: +* @li x1_indices: A 2D Tensor of type int32 or int64. +* @li The indices of the matrix "SparseTensor", with size [nnz, 2]. +* @li x1_values: A 1D Tensor. The values of the SparseTensor, with size [nnz]. +* @li x1_shape: A 1D Tensor of type int64. The shape of the SparseTensor, with size [2]. +* @li x2: A dense matrix Tensor of the same type as "x1_values". 2D. + +*@par Outputs: +*y: A "Tensor". Has the same type as "x1_values". + +*@par Attributes: +*@li adjoint_a: An optional bool. Defaults to "False".Use the adjoint of A in the matrix multiply. +*@li If A is complex, this is transpose(conj(A)). Otherwise it is transpose(A). +*@li adjoint_b: An optional bool. Defaults to "False".Use the adjoint of B in the matrix multiply. +*@li If B is complex, this is transpose(conj(B)). Otherwise it is transpose(B). 
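+*
+*@par Example
+*A construction sketch (informative; the accessor names follow the
+*set_input_/set_attr_ pattern generated by operator_reg.h):
+*@code
+*    ge::op::SparseTensorDenseMatMul matmul("sparse_dense_matmul");
+*    matmul.set_input_x1_indices(indices)
+*          .set_input_x1_values(values)
+*          .set_input_x1_shape(shape)
+*          .set_input_x2(dense)
+*          .set_attr_adjoint_a(false)
+*          .set_attr_adjoint_b(true);   // multiply by the adjoint of B
+*@endcode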
+
+*/
 REG_OP(SparseTensorDenseMatMul)
     .INPUT(x1_indices, TensorType({DT_INT32, DT_INT64}))
-    .INPUT(x1_values, TensorType({DT_FLOAT, DT_INT32, DT_DOUBLE}))
+    .INPUT(x1_values, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, \
+        DT_COMPLEX64, DT_COMPLEX128, DT_FLOAT16}))
     .INPUT(x1_shape, TensorType({DT_INT64}))
-    .INPUT(x2, TensorType({DT_FLOAT, DT_INT32, DT_DOUBLE}))
-    .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_DOUBLE}))
+    .INPUT(x2, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_COMPLEX64, \
+        DT_COMPLEX128, DT_FLOAT16}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_COMPLEX64, \
+        DT_COMPLEX128, DT_FLOAT16}))
     .ATTR(adjoint_a, Bool, false)
     .ATTR(adjoint_b, Bool, false)
     .OP_END_FACTORY_REG(SparseTensorDenseMatMul)
 
+/**
+*@brief Converts a sparse representation into a dense tensor.
+
+*@par Inputs:
+* @li indices: A 0D, 1D, or 2D Tensor of type int32 or int64.
+* @li output_shape: A 1D Tensor of the same type as "indices". The shape of the dense output tensor.
+* @li values: A 1D Tensor. Values corresponding to each row of "indices", or a scalar value to be used for all sparse indices.
+* @li default_value: A Tensor of the same type as "values".
+
+*@par Outputs:
+*y: A Tensor. Has the same type as "values".
+
+*/
 REG_OP(SparseToDense)
     .INPUT(indices, TensorType({DT_INT32, DT_INT64}))
     .INPUT(output_shape, TensorType({DT_INT32, DT_INT64}))
@@ -167,54 +390,148 @@ REG_OP(SparseToDense)
     .ATTR(validate_indices, Bool, true)
     .OP_END_FACTORY_REG(SparseToDense)
 
+/**
+*@brief Concatenates a list of `SparseTensor` along the specified dimension.\n
+*Concatenation is with respect to the dense versions of these sparse tensors.
+
+*@par Inputs:
+*Three dynamic inputs, including:
+* @li indices: A list of at least 2 `Tensor` objects with type `int64`. 2-D. \n
+*Indices of each input `SparseTensor`.
+* @li values: A list with the same length as `indices` of `Tensor` objects with the same type.
+* @li shapes: A list with the same length as `indices` of `Tensor` objects with type `int64`. 1-D. \n
+* Shapes of each `SparseTensor`.
+
+*@par Attributes:
+*@li concat_dim: An `int`. The dimension along which to concatenate.
+*@li N: The number of input `SparseTensor` objects.
+
+*@par Outputs:
+* @li y_indices: A `Tensor` of type `int64`.
+* @li y_values: A `Tensor`. Has the same type as `values`.
+* @li y_shape: A `Tensor` of type `int64`.
+
+* Compatible with the TensorFlow operator SparseConcat.
+*/
 REG_OP(SparseConcat)
     .DYNAMIC_INPUT(indices, TensorType({DT_INT64}))
     .DYNAMIC_INPUT(values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
-        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE}))
+        DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \
+        DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING}))
     .DYNAMIC_INPUT(shapes, TensorType({DT_INT64}))
     .OUTPUT(y_indices, TensorType({DT_INT64}))
     .OUTPUT(y_values,
-            TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \
-            DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE}))
+            TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
+            DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \
+            DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING}))
    .OUTPUT(y_shape, TensorType({DT_INT64}))
    .ATTR(concat_dim, Int, 0)
    .ATTR(N, Int, 1)
    .OP_END_FACTORY_REG(SparseConcat)
 
+/**
+*@brief Adds two `SparseTensor` objects to produce another `SparseTensor`.
+
+*@par Inputs:
+*Seven inputs, including:
+* @li x1_indices: A `Tensor` of type `int64`. 2-D. \n
+* The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+* @li x1_values: A `Tensor`.
Must be one of the following types:float,int8,int16,int32,int64, float64. +* @li x1_shape:A `Tensor` of type `int64`.1-D. The `shape` of the first `SparseTensor`, \n +* size `[ndims]` Vector. +* @li x2_indices:A `Tensor` of type `int64`.2-D.The `indices` of the second `SparseTensor`, \n +* size `[nnz, ndims]` Matrix. +* @li x2_values:A `Tensor`. Must have the same type as `a_values`.1-D. \n +* The `values` of the second `SparseTensor`, size `[nnz]` Vector. +* @li x2_shape:A `Tensor` of type `int64`.1-D. \n +* The `shape` of the second `SparseTensor`, size `[ndims]` Vector. +* @li thresh:A `Tensor` 0-D.The magnitude threshold that determines if an output value/index pair takes space. + +*@par Outputs: +* @li sum_indices:A `Tensor` of type `int64`. +* @li sum_values:A `Tensor`. Has the same type as `x1_values`. +* @li sum_shape:A `Tensor` of type `int64`. + +* Compatible SparseAdd operator in Tensorflow +*/ REG_OP(SparseAdd) .INPUT(x1_indices, TensorType({DT_INT64})) .INPUT(x1_values, TensorType({DT_FLOAT, DT_INT8, DT_INT16, \ - DT_INT32, DT_INT64, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x1_shape, TensorType({DT_INT64})) .INPUT(x2_indices, TensorType({DT_INT64})) .INPUT(x2_values, TensorType({DT_FLOAT, DT_INT8, DT_INT16, DT_INT32, \ - DT_INT64, DT_DOUBLE})) + DT_INT64, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x2_shape, TensorType({DT_INT64})) .INPUT(thresh, TensorType({DT_FLOAT, DT_INT8, DT_INT16, DT_INT32, \ DT_INT64, DT_DOUBLE})) .OUTPUT(sum_indices, TensorType({DT_INT64})) .OUTPUT(sum_values, TensorType({DT_FLOAT, DT_INT8, DT_INT16, \ - DT_INT32, DT_INT64, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .OUTPUT(sum_shape, TensorType({DT_INT64})) .OP_END_FACTORY_REG(SparseAdd) +/** +*@brief Fills empty rows in the input 2-D `SparseTensor` with a default value. + +*@par Inputs: +*4 inputs,contains: +* @li indices: A `Tensor` of type `int64`.2-D. the indices of the sparse tensor. +* @li values: A `Tensor`. 1-D. the values of the sparse tensor. +* @li dense_shape: A `Tensor` of type `int64`.1-D. the shape of the sparse tensor. +* @li default_value: `Tensor`. Must have the same type as `values`.\n +*0-D. default value to insert into location `[row, 0, ..., 0]` \n +*for rows missing from the input sparse tensor. + +*@par Outputs: +* @li y_indices:A `Tensor` of type `int64`. +* @li y_values:A `Tensor`. Has the same type as `values`. +* @li empty_row_indicator:A `Tensor` of type `bool`. +* @li reverse_index_map:A `Tensor` of type `int64`. 
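+*
+*@par Example
+*A minimal wiring sketch (informative; the input operands are
+*placeholders for upstream operator outputs):
+*@code
+*    ge::op::SparseFillEmptyRows fill_rows("sparse_fill_empty_rows");
+*    fill_rows.set_input_indices(indices)
+*             .set_input_values(values)
+*             .set_input_dense_shape(dense_shape)
+*             .set_input_default_value(default_value);  // 0-D filler value
+*@endcode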
+ +* Compatible SparseFillEmptyRows operator in Tensorflow +*/ REG_OP(SparseFillEmptyRows) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE})) + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .INPUT(dense_shape, TensorType({DT_INT64})) .INPUT(default_value, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, \ DT_INT16, DT_UINT16, DT_UINT8, \ - DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .OUTPUT(y_indices, TensorType({DT_INT64})) .OUTPUT(y_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, \ DT_INT16, DT_UINT16, DT_UINT8, \ - DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .OUTPUT(empty_row_indicator, TensorType({DT_BOOL})) .OUTPUT(reverse_index_map, TensorType({DT_INT64})) .OP_END_FACTORY_REG(SparseFillEmptyRows) +/** +*@brief Returns the element-wise max of two SparseTensors. + +*@par Inputs: +*6 inputs,contains: +* @li x1_indices:A `Tensor` of type `int64`.2-D. \n +*`N x R` matrix with the indices of non-empty values in a SparseTensor, \n +* in the canonical lexicographic ordering. +* @li x1_values:A `Tensor`. 1-D. the values of the sparse tensor. +* @li x1_shape:A `Tensor` of type `int64`.1-D. the shape of the sparse tensor. +* @li x2_indices:A `Tensor` of type `int64`.2-D. the indices of the sparse tensor. +* @li x2_values:A `Tensor`. 1-D. Must have the same type as `x1_values`. +* @li x2_shape:A `Tensor` of type `int64`.1-D. \n +*counterpart to `a_shape` for the other operand; the two shapes must be equal. + +*@par Outputs: +* @li y_indices:A `Tensor` of type `int64`. +* @li y_values:A `Tensor`. Has the same type as `x1_values`. + +* Compatible SparseSparseMaximum operator in Tensorflow +*/ REG_OP(SparseSparseMaximum) .INPUT(x1_indices, TensorType({DT_INT64})) .INPUT(x1_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ @@ -229,20 +546,67 @@ REG_OP(SparseSparseMaximum) DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) .OP_END_FACTORY_REG(SparseSparseMaximum) +/** +*@brief Returns the element-wise min of two SparseTensors. + +*@par Inputs: +*6 inputs,contains: +* @li x1_indices:A `Tensor` of type `int64`.2-D. \n +*`N x R` matrix with the indices of non-empty values in a SparseTensor, \n +* in the canonical lexicographic ordering. +* @li x1_values:A `Tensor`. 1-D. the values of the sparse tensor. +* @li x1_shape:A `Tensor` of type `int64`.1-D. the shape of the sparse tensor. +* @li x2_indices:A `Tensor` of type `int64`.2-D. the indices of the sparse tensor. +* @li x2_values:A `Tensor`. 1-D. Must have the same type as `x1_values`. +* @li x2_shape:A `Tensor` of type `int64`.1-D. \n +*counterpart to `a_shape` for the other operand; the two shapes must be equal. + +*@par Outputs: +* @li y_indices:A `Tensor` of type `int64`. +* @li y_values:A `Tensor`. Has the same type as `x1_values`. 
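+*
+*@par Example
+*An informative sketch (the six inputs mirror the registration below;
+*the operand names are placeholders):
+*@code
+*    ge::op::SparseSparseMinimum sparse_min("sparse_sparse_minimum");
+*    sparse_min.set_input_x1_indices(x1_indices)
+*              .set_input_x1_values(x1_values)
+*              .set_input_x1_shape(x1_shape)
+*              .set_input_x2_indices(x2_indices)
+*              .set_input_x2_values(x2_values)   // same type as x1_values
+*              .set_input_x2_shape(x2_shape);    // must equal x1_shape
+*@endcode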
+ +* Compatible SparseSparseMinimum operator in Tensorflow +*/ REG_OP(SparseSparseMinimum) .INPUT(x1_indices, TensorType({DT_INT64})) - .INPUT(x1_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) + .INPUT(x1_values, TensorType({DT_INT64, DT_INT32, \ + DT_UINT16, DT_INT16, DT_UINT8, DT_INT8, DT_FLOAT16, \ + DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x1_shape, TensorType({DT_INT64})) .INPUT(x2_indices, TensorType({DT_INT64})) - .INPUT(x2_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) + .INPUT(x2_values, TensorType({DT_INT64, DT_INT32, \ + DT_UINT16, DT_INT16, DT_UINT8, DT_INT8, DT_FLOAT16, \ + DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x2_shape, TensorType({DT_INT64})) .OUTPUT(y_indices, TensorType({DT_INT64})) - .OUTPUT(y_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) + .OUTPUT(y_values, TensorType({DT_INT64, DT_INT32, \ + DT_UINT16, DT_INT16, DT_UINT8, DT_INT8, DT_FLOAT16, \ + DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128})) .OP_END_FACTORY_REG(SparseSparseMinimum) +/** +*@brief Computes the max of elements across dimensions of a SparseTensor. + +*@par Inputs: +*4 or 5 inputs,contains: +* @li x_indices:A `Tensor` of type `int64`.2-D. \n +*`N x R` matrix with the indices of non-empty values in a \n +*SparseTensor, possibly not in canonical ordering. +* @li x_values:A `Tensor`. 1-D. the values of the sparse tensor. \n +*`N` non-empty values corresponding to `input_indices`. +* @li x_shape:A `Tensor` of type `int64`.1-D. Shape of the input SparseTensor. +* @li reduction_axes:A `Tensor` of type `int32`.1-D.\n +*Length-`K` vector containing the reduction axes. + +*@par Attributes: +* keep_dims:An optional `bool`. Defaults to `False`.\n +*If true, retain reduced dimensions with length 1. + +*@par Outputs: +* y:A `Tensor`. Has the same type as `input_values`. + +* Compatible SparseReduceMax operator in Tensorflow +*/ REG_OP(SparseReduceMax) .INPUT(x_indices, TensorType({DT_INT64})) .INPUT(x_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ @@ -254,6 +618,31 @@ REG_OP(SparseReduceMax) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(SparseReduceMax) +/** +*@brief Computes the max of elements across dimensions of a SparseTensor. + +*@par Inputs: +*4 or 5 inputs,contains: +* @li x_indices:A `Tensor` of type `int64`.2-D. \n +*`N x R` matrix with the indices of non-empty values in a \n +*SparseTensor, possibly not in canonical ordering. +* @li x_values:A `Tensor`. 1-D. the values of the sparse tensor. \n +*`N` non-empty values corresponding to `input_indices`. +* @li x_shape:A `Tensor` of type `int64`.1-D. Shape of the input SparseTensor. +* @li reduction_axes:A `Tensor` of type `int32`.1-D.\n +*Length-`K` vector containing the reduction axes. + +*@par Attributes: +* keep_dims:An optional `bool`. Defaults to `False`.\n +*If true, retain reduced dimensions with length 1. + +*@par Outputs: +* @li y_indices:A `Tensor` of type `int64`. +* @li y_values:A `Tensor`. Has the same type as `input_values`. +* @li y_shape:A `Tensor` of type `int64`. 
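+*
+*@par Example
+*A wiring sketch (informative; "reduction_axes" would typically come
+*from a Const operator holding the int32 axis vector):
+*@code
+*    ge::op::SparseReduceMaxSparse reduce_max("sparse_reduce_max_sparse");
+*    reduce_max.set_input_x_indices(x_indices)
+*              .set_input_x_values(x_values)
+*              .set_input_x_shape(x_shape)
+*              .set_input_reduction_axes(reduction_axes)
+*              .set_attr_keep_dims(false);
+*@endcode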
+ +* Compatible SparseReduceMaxSparse operator in Tensorflow +*/ REG_OP(SparseReduceMaxSparse) .INPUT(x_indices, TensorType({DT_INT64})) .INPUT(x_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ @@ -267,44 +656,148 @@ REG_OP(SparseReduceMaxSparse) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(SparseReduceMaxSparse) +/** +*@brief Computes the sum of elements across dimensions of a SparseTensor. + +*@par Inputs: +*4 or 5 inputs, including: +* @li x_indices: A 2D Tensor of type int64. +*"N x R" matrix with the indices of non-empty values in a \n +*SparseTensor, possibly not in canonical ordering. +* @li x_values: A 1D Tensor. The values of the SparseTensor. +*"N" non-empty values corresponding to "input_indices". +* @li x_shape: A 1D Tensor of type int64. Shape of the input SparseTensor. +* @li reduction_axes: A 1D Tensor of type int32. \n +*A length-"K" vector containing the reduction axes. + +*@par Attributes: +* keep_dims: An optional bool. Defaults to "False". \n +*If true, retains reduced dimensions with length 1. + +*@par Outputs: +* @li y_indices: A Tensor of type int64. +* @li y_values: A Tensor. Has the same type as "input_values". +* @li y_shape: A Tensor of type int64. + +*/ REG_OP(SparseReduceSum) .INPUT(x_indices, TensorType({DT_INT64})) .INPUT(x_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x_shape, TensorType({DT_INT64})) .INPUT(reduction_axes, TensorType({DT_INT32})) .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(SparseReduceSum) +/** +*@brief Computes the sum of elements across dimensions of a SparseTensor. + +*@par Inputs: +*4 or 5 inputs, including: +* @li x_indices: A 2D Tensor of type int64. +*"N x R" matrix with the indices of non-empty values in a \n +*SparseTensor, possibly not in canonical ordering. +* @li x_values: A 1D Tensor. The values of the SparseTensor. +*"N" non-empty values corresponding to "input_indices". +* @li x_shape: A 1D Tensor of type int64. Shape of the input SparseTensor. +* @li reduction_axes: A 1D Tensor of type int32. \n +* A length-"K" vector containing the reduction axes. + +*@par Attributes: +* keep_dims: An optional bool. Defaults to "False".\n +*If true, retains reduced dimensions with length 1. + +*@par Outputs: +* @li y_indices: A Tensor of type int64. +* @li y_values: A Tensor. Has the same type as "input_values". +* @li y_shape: A Tensor of type int64. 
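+*
+*@par Example
+*An informative sketch; unlike the dense-output variant above, the
+*reduced result stays sparse (the operand names are placeholders):
+*@code
+*    ge::op::SparseReduceSumSparse reduce_sum("sparse_reduce_sum_sparse");
+*    reduce_sum.set_input_x_indices(x_indices)
+*              .set_input_x_values(x_values)
+*              .set_input_x_shape(x_shape)
+*              .set_input_reduction_axes(axes)
+*              .set_attr_keep_dims(true);   // keep reduced dims as length 1
+*@endcode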
+ +*/ REG_OP(SparseReduceSumSparse) .INPUT(x_indices, TensorType({DT_INT64})) .INPUT(x_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .INPUT(x_shape, TensorType({DT_INT64})) .INPUT(reduction_axes, TensorType({DT_INT32})) .OUTPUT(y_indices, TensorType({DT_INT64})) .OUTPUT(y_values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE})) + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128})) .OUTPUT(y_shape, TensorType({DT_INT64})) .ATTR(keep_dims, Bool, false) .OP_END_FACTORY_REG(SparseReduceSumSparse) +/** +*@brief Splits a SparseTensor into "num_split" tensors along one dimension. + +*@par Inputs: +*4 or 5 inputs, including: +* @li split_dim: A 0D Tensor of type int64.\n +*The dimension along which to split. Must be in the range "[0, rank(shape))". +* @li indices: A 2D Tensor of type int64.\n +* The indices of the SparseTensor. +* @li values: A 1D Tensor. The values of the SparseTensor. +* @li shape: A 1D Tensor of type int64. Shape of the SparseTensor. + +*@par Attributes: +* num_split: An int that is >= 1. The number of ways to split. + +*@par Outputs: +* @li y_indices: A list of "num_split" Tensor objects of type int64. +* @li y_values: A list of "num_split" Tensor objects with the same type as "values". +* @li y_shape: A list of "num_split" Tensor objects of type int64. + +*/ REG_OP(SparseSplit) .INPUT(split_dim, TensorType({DT_INT64})) .INPUT(indices, TensorType({DT_INT64})) - .INPUT(values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .INPUT(values, TensorType({DT_INT64, DT_INT32, DT_UINT16, DT_INT16, \ + DT_UINT8, DT_INT8, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, \ + DT_COMPLEX128, DT_BOOL, DT_STRING, DT_RESOURCE})) .INPUT(shape, TensorType({DT_INT64})) .DYNAMIC_OUTPUT(y_indices, TensorType({DT_INT64})) - .DYNAMIC_OUTPUT(y_values, TensorType({DT_INT8, DT_UINT8, DT_INT16, \ - DT_UINT16, DT_INT32, DT_INT64, DT_BOOL, \ - DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .DYNAMIC_OUTPUT(y_values, TensorType({DT_INT64, DT_INT32, DT_UINT16, \ + DT_INT16, DT_UINT8, DT_INT8, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_BOOL, DT_STRING, DT_RESOURCE})) .DYNAMIC_OUTPUT(y_shape, TensorType({DT_INT64})) .ATTR(num_split, Int, 1) .OP_END_FACTORY_REG(SparseSplit) +/** +*@brief Generates sparse cross from a list of sparse and dense tensors. + +*@par Inputs: +*8 or 10 inputs, including: +* @li indices: A list of 2D Tensor objects of type int64. +* Indices of each input SparseTensor. +* @li values: A list of 1D Tensor objects of type int64 or string. +* Values of each SparseTensor. +* @li shapes: A list with the same length as "indices" of 1D Tensor objects of type int64. +* Shapes of each SparseTensor. +* @li dense_inputs: A list of 2D Tensor objects of type int64 or string. +* Columns represented by dense Tensor. + +*@par Attributes: +* @li N: number of sparse. +* @li hashed_output: A bool. If true, returns the hash of the cross instead of the string. +* @li num_buckets: An int that is >= 0. It is used if "hashed_output" is true. \n +*output = hashed_value%num_buckets if num_buckets > 0 else "hashed_value". +* @li hash_key: An int. Specify the hash_key that will be used by the "FingerprintCat64"\n +*function to combine the crosses fingerprints. 
+* @li out_type: An int64 or string. +* @li internal_type: An int64 or string. + +*@par Outputs: +* @li output_indices: A Tensor of type int64. +* @li output_values: A Tensor of type "out_type". +* @li output_shape: A Tensor of type int64. + +*/ REG_OP(SparseCross) .DYNAMIC_INPUT(indices, TensorType({DT_INT64})) .DYNAMIC_INPUT(values, TensorType({DT_INT64, DT_STRING})) @@ -321,59 +814,171 @@ REG_OP(SparseCross) .REQUIRED_ATTR(internal_type, Type) .OP_END_FACTORY_REG(SparseCross) +/** +*@brief Generates sparse cross from a list of sparse and dense tensors. + +*@par Inputs: +*3 or 5 inputs, including: +* @li indices: A 2D Tensor of type int64. \n +* The "indices" of the minibatch SparseTensor. +* @li values: A 1D Tensor. The "values" of the minibatch SparseTensor. +* @li shape: A 1D Tensor of type int64. The "shape" of the minibatch SparseTensor. + +*@par Attributes: +* @li container: An optional string. Defaults to "". \n +*The container name for the "SparseTensorsMap" created by this op. +* @li shared_name: An optional string. Defaults to "". \n +*The shared name for the "SparseTensorsMap" created by this op. + +*@par Outputs: +* handles: A Tensor of type int64. + +*/ REG_OP(AddManySparseToTensorsMap) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .INPUT(shape, TensorType({DT_INT64})) .OUTPUT(handles, TensorType({DT_INT64})) .ATTR(container, String, "") .ATTR(shared_name, String, "") .OP_END_FACTORY_REG(AddManySparseToTensorsMap) +/** +*@brief Reads SparseTensors from a "SparseTensorsMap" and concatenate them. + +*@par Inputs: +*2 or 4 inputs, including: +* handles: A 1D Tensor of type int64. \n +* The "N" serialized SparseTensor objects. + +*@par Attributes: +* @li dtype: A tf.DType. The "dtype" of the SparseTensor objects stored in the "SparseTensorsMap". +* @li container: An optional string. Defaults to "". \n +*The container name for the "SparseTensorsMap" read by this op. +* @li shared_name: An optional string. Defaults to "". \n +*The shared name for the "SparseTensorsMap" read by this op. + +*@par Outputs: +* @li indices: A Tensor of type int64. +* @li values: A Tensor of type "dtype". +* @li shape: A Tensor of type int64. + +*/ REG_OP(TakeManySparseFromTensorsMap) .INPUT(handles, TensorType({DT_INT64})) .OUTPUT(indices, TensorType({DT_INT64})) .OUTPUT(values, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, \ - DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLAOT16})) + DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16})) .OUTPUT(shape, TensorType({DT_INT64})) .REQUIRED_ATTR(dtype, Type) .ATTR(container, String, "") .ATTR(shared_name, String, "") .OP_END_FACTORY_REG(TakeManySparseFromTensorsMap) +/** +*@brief Serializes a SparseTensor into a [3] Tensor object. + +*@par Inputs: +*3 or 4 inputs, including: +* @li indices: A 2D Tensor of type int64. The indices of the SparseTensor. +* @li values: A 1D Tensor. The values of the SparseTensor. +* @li shape: A 1D Tensor of type int64. The shape of the SparseTensor. + +*@par Attributes: +* out_type: An optional type. Defaults to "string". + +*@par Outputs: +* serialized_sparse: A Tensor of type "out_type". 
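+*
+*@par Example
+*A minimal sketch (informative; "set_attr_out_type" is assumed to take a
+*ge::DataType, per the Type attribute generated by operator_reg.h):
+*@code
+*    ge::op::SerializeSparse serialize("serialize_sparse");
+*    serialize.set_input_indices(indices)
+*             .set_input_values(values)
+*             .set_input_shape(shape)
+*             .set_attr_out_type(ge::DT_STRING);
+*@endcode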
+ +*/ REG_OP(SerializeSparse) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, \ - DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLAOT16})) + DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .INPUT(shape, TensorType({DT_INT64})) .OUTPUT(serialized_sparse, TensorType({DT_STRING})) .ATTR(out_type, Type, DT_STRING) .OP_END_FACTORY_REG(SerializeSparse) +/** +*@brief Serializes an "N"-minibatch SparseTensor into an [N, 3] Tensor object. + +*@par Inputs: +*3 or 4 inputs, including: +* @li indices: A 2D Tensor of type int64. The "indices" of the minibatch SparseTensor. +* @li values: A 1D Tensor. The "values" of the minibatch SparseTensor. +* @li shape: A 1D Tensor of type int64. The "shape" of the minibatch SparseTensor. + +*@par Attributes: +* out_type: An optional type. Defaults to "string". + +*@par Outputs: +* serialized_sparse: A Tensor of type "out_type". + +*/ REG_OP(SerializeManySparse) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, \ - DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLAOT16})) + DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .INPUT(shape, TensorType({DT_INT64})) .OUTPUT(serialized_sparse, TensorType({DT_STRING})) .ATTR(out_type, Type, DT_STRING) .OP_END_FACTORY_REG(SerializeManySparse) +/** +*@brief Deserializes SparseTensor objects. + +*@par Inputs: +*Two inputs, including: +* serialized_sparse: A Tensor. The serialized SparseTensor objects. \n +*The last dimension must have 3 columns. + +*@par Attributes: +* dtype: An optional type. The type of the serialized SparseTensor objects. + +*@par Outputs: +* @li indices: A Tensor of type int64. +* @li values: A Tensor of type "dtype". +* @li shape: A Tensor of type int64. + +*/ REG_OP(DeserializeSparse) .INPUT(serialized_sparse, TensorType({DT_STRING})) .OUTPUT(indices, TensorType({DT_INT64})) .OUTPUT(values, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, \ - DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLAOT16})) + DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .OUTPUT(shape, TensorType({DT_INT64})) .REQUIRED_ATTR(dtype, Type) .OP_END_FACTORY_REG(DeserializeSparse) +/** +*@brief Deserializes and concatenates SparseTensors from a serialized minibatch. + +*@par Inputs: +*Two inputs, including: +* serialized_sparse: A 2D Tensor of type string. \n +*The "N" serialized SparseTensor objects. Must have 3 columns. + +*@par Attributes: +* dtype: An optional type. The type of the serialized SparseTensor objects. + +*@par Outputs: +* @li indices: A Tensor of type int64. +* @li values: A Tensor of type "dtype". +* @li shape: A Tensor of type int64. 
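+*
+*@par Example
+*An informative sketch; "dtype" is required and must match the type the
+*SparseTensors were serialized with (the operand name is a placeholder):
+*@code
+*    ge::op::DeserializeManySparse deserialize("deserialize_many_sparse");
+*    deserialize.set_input_serialized_sparse(serialized)  // [N, 3] strings
+*               .set_attr_dtype(ge::DT_FLOAT);
+*@endcode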
+ +*/ REG_OP(DeserializeManySparse) .INPUT(serialized_sparse, TensorType({DT_STRING})) .OUTPUT(indices, TensorType({DT_INT64})) .OUTPUT(values, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, \ - DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLAOT16})) + DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16, \ + DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .OUTPUT(shape, TensorType({DT_INT64})) .REQUIRED_ATTR(dtype, Type) .OP_END_FACTORY_REG(DeserializeManySparse) diff --git a/third_party/fwkacllib/inc/ops/split_combination_ops.h b/third_party/fwkacllib/inc/ops/split_combination_ops.h index 2deeef7e..734847f4 100644 --- a/third_party/fwkacllib/inc/ops/split_combination_ops.h +++ b/third_party/fwkacllib/inc/ops/split_combination_ops.h @@ -19,34 +19,119 @@ #include "../graph/operator_reg.h" namespace ge { +/** +*@brief Splits a tensor along dimension "split_dim" into "num_split" smaller tensors. + +*@par Inputs: +* Two inputs, including: +*@li x: An ND Tensor. \n +*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64 +*@li split_dim: Must be the following type:int32. Specifies the dimension along which to split. + +*@par Attributes: +*num_split: A required int8, int16, int32, or int64. Specifies the number of output tensors. No default value. + +*@par Outputs: +*y: Dynamic output.A list of output tensors. Has the same type and format as "x". + +*@attention Constraints: +*@li "num_split" is greater than or equals to 1. +*@li "num_split" is divisible by the size of dimension "split_dim". +*@li "split_dim" is in the range [-len(x.shape), (x.shape)-1]. + +*/ REG_OP(Split) .INPUT(split_dim, TensorType({DT_INT32})) - .INPUT(value, TensorType::BasicType()) - .DYNAMIC_OUTPUT(output, TensorType::BasicType()) + .INPUT(x, TensorType::BasicType()) + .DYNAMIC_OUTPUT(y, TensorType::BasicType()) .REQUIRED_ATTR(num_split, Int) .OP_END_FACTORY_REG(Split) +/** +*@brief Splits a tensor along dimension "split_dim" into "num_split" smaller tensors. + +*@par Inputs: +* One input: +*: An ND Tensor. \n +*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64 + +*@par Attributes: +*@li split_dim: A required int8, int16, int32, or int64. Specifies the dimension along which to split. No default value. +*@li num_split: A required int8, int16, int32, or int64. Specifies the number of output tensors. No default value. + +*@par Outputs: +*y:Dynamic output. A list of output tensors. Has the same type and format as "x". + +*@attention Constraints: +*@li "num_split" is greater than or equals to 1. +*@li "num_split" is divisible by the size of dimension "split_dim". +*@li "split_dim" is in the range [-len(x.shape), (x.shape)-1]. + +*/ REG_OP(SplitD) - .INPUT(value, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, + .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_FLOAT, DT_FLOAT16})) - .DYNAMIC_OUTPUT(output, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, + .DYNAMIC_OUTPUT(y, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_FLOAT, DT_FLOAT16})) .REQUIRED_ATTR(split_dim, Int) .REQUIRED_ATTR(num_split, Int) .OP_END_FACTORY_REG(SplitD) +/** +*@brief Splits a tensor along dimension "split_dim" into "num_split" smaller tensors according to "size_splits". + +*@par Inputs: +* Three inputs, including: +*@li x: An ND Tensor. 
\n +*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64 +*@li size_splits: A list of int8, int16, int32, or int64. Specifies a list containing the sizes of each output tensor along the split dimension. +*@li split_dim: An int8, int16, int32, or int64. Specifies the dimension along which to split. + +*@par Attributes: +*num_split: A required int8, int16, int32, or int64. Specifies the number of output tensors. No default value. + +*@par Outputs: +*y: Dynamic output.A list of output tensors. Has the same type and format as "x". + +*@attention Constraints: +*@li Each element in "size_splits" is greater than or equal to 1. +*@li "size_splits" and "num_split" have the same length. +*@li The elements in "size_splits" sum to the size of dimension "split_dim". + +*/ REG_OP(SplitV) - .INPUT(input_value, TensorType::BasicType()) - .INPUT(input_size_splits, TensorType::IndexNumberType()) - .INPUT(input_split_dim, TensorType({DT_INT32})) - .DYNAMIC_OUTPUT(output_data, TensorType::BasicType()) + .INPUT(x, TensorType::BasicType()) + .INPUT(size_splits, TensorType::IndexNumberType()) + .INPUT(split_dim, TensorType({DT_INT32})) + .DYNAMIC_OUTPUT(y, TensorType::BasicType()) .REQUIRED_ATTR(num_split, Int) .OP_END_FACTORY_REG(SplitV) +/** +*@brief Splits a tensor along dimension "split_dim" into "num_split" smaller tensors according to "size_splits". + +*@par Inputs: +* One input: +* x: An ND Tensor. \n +*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64 + +*@par Attributes: +*@li size_splits: A required list of int8, int16, int32, or int64. Specifies a list containing the sizes of each output tensor along the split dimension. +*@li split_dim: A required int8, int16, int32, or int64. Specifies the dimension along which to split. No default value. +*@li num_split: A required int8, int16, int32, or int64. Specifies the number of output tensors. No default value. + +*@par Outputs: +*y: Dynamic output.A list of output tensors. Has the same type and format as "x". + +*@attention Constraints: +*@li Each element in "size_splits" is greater than or equal to 1. +*@li "size_splits" and "num_split" have the same length. +*@li The elements in "size_splits" sum to the size of dimension "split_dim". +*/ REG_OP(SplitVD) - .INPUT(input_value, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, + .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_FLOAT, DT_FLOAT16})) - .DYNAMIC_OUTPUT(output_data, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, + .DYNAMIC_OUTPUT(y, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64, DT_FLOAT, DT_FLOAT16})) .REQUIRED_ATTR(size_splits, ListInt) .REQUIRED_ATTR(split_dim, Int) @@ -66,7 +151,8 @@ REG_OP(SplitVD) * but with the number of input values in the first dimension. *@par Attributes: -* shape: A required list of ints. +* @li shape: A required list of ints. +* @li N: The numble of dynamic_input "values". *@par Outputs: *output_data: The concatenated tensor with same type as "values". 
@@ -74,48 +160,127 @@ REG_OP(SplitVD) REG_OP(ParallelConcat) .DYNAMIC_INPUT(values, TensorType({DT_FLOAT,DT_FLOAT16,DT_INT8,DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_UINT32,DT_UINT64})) .OUTPUT(output_data, TensorType({DT_FLOAT,DT_FLOAT16,DT_INT8,DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_UINT32,DT_UINT64})) - .REQUIRED_ATTR(shape, ListInt) - .OP_END_FACTORY_REG(ParallelConcat) - -REG_OP(ConcatExt2) - .DYNAMIC_INPUT(input_values, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_INT64, DT_UINT64, DT_UINT32, DT_INT16, DT_UINT16, DT_UINT8})) - .OUTPUT(output_data, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_INT64, DT_UINT64, DT_UINT32, DT_INT16, DT_UINT16, DT_UINT8})) - .REQUIRED_ATTR(axis, Int) + .REQUIRED_ATTR(shape, ListInt) .REQUIRED_ATTR(N, Int) - .OP_END_FACTORY_REG(ConcatExt2) + .OP_END_FACTORY_REG(ParallelConcat) + +/** +*@brief Concatenates tensors along one dimension. + +*@par Inputs: +* One input: +*x: Dynamic input.An NC1HWC0 or ND Tensor. \n +*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64 + +*@par Attributes: +*concat_dim: A required int8, int16, int32, or int64. Specifies the dimension along which to concatenate. No default value. + +*@par Outputs: +*y: A Tensor. Has the same type and format as "x". + +*@attention Constraints: +*@li "x" is a list of at least 2 "tensor" objects of the same type. +*@li "concat_dim" is in the range [-len(x.shape), len(x.shape)]. + +*/ +REG_OP(ConcatV2D) + .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_INT64, DT_UINT64, DT_UINT32, DT_INT16, DT_UINT16, DT_UINT8})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_INT64, DT_UINT64, DT_UINT32, DT_INT16, DT_UINT16, DT_UINT8})) + .REQUIRED_ATTR(concat_dim, Int) + .ATTR(N, Int, 1) + .OP_END_FACTORY_REG(ConcatV2D) + +/** +*@brief Concatenates tensors along one dimension. + +*@par Inputs: +* Two inputs, including: +*@li Dynamic input "x" is An NC1HWC0 or ND Tensor. \n +*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64 +*@li concat_dim: An int8, int16, int32, or int64. Specifies the dimension along which to concatenate. +*@par Attributes: +*N: An optional int8, int16, int32, or int64. Specifies the number of elements in "x". No default value. + +*@par Outputs: +*y: A Tensor. Has the same type and format as "x". + +*@attention Constraints: +* "x" is a list of at least 2 "tensor" objects of the same type. + +*/ REG_OP(ConcatV2) - .DYNAMIC_INPUT(input_values, TensorType::BasicType()) - .INPUT(axis, TensorType::IndexNumberType()) - .OUTPUT(output_data, TensorType::BasicType()) - .REQUIRED_ATTR(N, Int) + .DYNAMIC_INPUT(x, TensorType::BasicType()) + .INPUT(concat_dim, TensorType::IndexNumberType()) + .OUTPUT(y, TensorType::BasicType()) + .ATTR(N, Int, 1) .OP_END_FACTORY_REG(ConcatV2) +/** +*@brief Concatenates tensors along one dimension. + +*@par Inputs: +* One input: +*x:Dynamic input. An NC1HWC0 or ND Tensor. \n +*Must be one of the following types: \n float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64 + +*@par Attributes: +*@li concat_dim: A required int8, int16, int32, or int64. Specifies the dimension along which to concatenate. No default value. +*@li N: An optional int8, int16, int32, or int64. Specifies the number of elements in "x". No default value. + +*@par Outputs: +*y: A Tensor. Has the same type and format as "x". 
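+*
+*@par Example
+*A construction sketch (informative; dynamic inputs use the
+*create_dynamic_input_/set_dynamic_input_ accessors that the
+*DYNAMIC_INPUT registration is assumed to generate):
+*@code
+*    ge::op::ConcatD concat("concat_d");
+*    concat.create_dynamic_input_x(2);   // reserve two inputs
+*    concat.set_dynamic_input_x(0, t0);
+*    concat.set_dynamic_input_x(1, t1);
+*    concat.set_attr_concat_dim(1)
+*          .set_attr_N(2);
+*@endcode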
+ +*@attention Constraints: +*@li "x" is a list of at least 2 "tensor" objects of the same type. +*@li "concat_dim" is in the range [-len(x.shape), len(x.shape)]. + +*/ REG_OP(ConcatD) - .DYNAMIC_INPUT(input_values, TensorType({DT_FLOAT,DT_FLOAT16,DT_INT8,DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_UINT32,DT_UINT64})) - .OUTPUT(output_data, TensorType({DT_FLOAT,DT_FLOAT16,DT_INT8,DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_UINT32,DT_UINT64})) + .DYNAMIC_INPUT(x, TensorType({DT_FLOAT,DT_FLOAT16,DT_INT8,DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_UINT32,DT_UINT64})) + .OUTPUT(y, TensorType({DT_FLOAT,DT_FLOAT16,DT_INT8,DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_UINT32,DT_UINT64})) .REQUIRED_ATTR(concat_dim, Int) - .REQUIRED_ATTR(N, Int) + .ATTR(N, Int, 1) .OP_END_FACTORY_REG(ConcatD) +/** +*@brief Concatenates tensors along one dimension. + +*@par Inputs: +* Two inputs, including: +*@li x: Dynamic input.An NC1HWC0 or ND Tensor. \n +*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64 +*@li concat_dim: An int8, int16, int32, or int64. Specifies the dimension along which to concatenate. + +*@par Attributes: +*N: An optional int8, int16, int32, or int64. Specifies the number of elements in "x". + +*@par Outputs: +*y: A Tensor. Has the same type and format as "x". + +*@attention Constraints: +*@li "x" is a list of at least 2 "tensor" objects of the same type. +*@li "concat_dim" is in the range [-len(x.shape), len(x.shape)]. + +*/ REG_OP(Concat) - .DYNAMIC_INPUT(input_values, TensorType::BasicType()) + .DYNAMIC_INPUT(x, TensorType::BasicType()) .INPUT(concat_dim, TensorType::IndexNumberType()) - .OUTPUT(output_data, TensorType::BasicType()) - .REQUIRED_ATTR(N, Int) + .OUTPUT(y, TensorType::BasicType()) + .ATTR(N, Int, 1) .OP_END_FACTORY_REG(Concat) /** *@brief Packs the list of tensors in values into a tensor with rank one higher than each tensor in -* values, by packing them along the axis dimension. Given a list of length N of tensors of +* values, by packing them along the axis dimension. Given a list of length N of tensors of * shape (A, B, C); if axis == 0 then the output tensor will have the shape (N, A, B, C). *@par Inputs: -* x: A list of N Tensors. Must be one of the following types: int8, int16, int32, +* x: A list of N Tensors. Must be one of the following types: int8, int16, int32, * int64, uint8, uint16, uint32, uint64, float16, float32, bool. *@par Attributes: -*@li axis: A required int. +*@li axis: A optional int, defaultvalue is 0. * Dimension along which to pack. The range is [-(R+1), R+1). *@li N: A required int. Number of tensors. @@ -125,7 +290,7 @@ REG_OP(Concat) REG_OP(Pack) .DYNAMIC_INPUT(x, TensorType::BasicType()) .OUTPUT(y, TensorType::BasicType()) - .REQUIRED_ATTR(axis, Int) + .ATTR(axis, Int, 0) .REQUIRED_ATTR(N, Int) .OP_END_FACTORY_REG(Pack) @@ -139,7 +304,7 @@ REG_OP(Pack) *@par Attributes: *@li Concat_dim: A required int. Must be within the rank of input "x". -*@li N: A required int. +*@li N: A required int. *@par Outputs: *y: A Tensor list with same type as "x". @@ -161,7 +326,7 @@ REG_OP(ConcatOffset) *@par Attributes: *@li Concat_dim: A required int. Must be within the rank of input "x". -*@li N: A required int. +*@li N: A required int. *@par Outputs: *y: A Tensor list with same type as "x". 
diff --git a/third_party/fwkacllib/inc/ops/ssddetectionoutput_ops.h b/third_party/fwkacllib/inc/ops/ssddetectionoutput_ops.h
new file mode 100644
index 00000000..7c50db14
--- /dev/null
+++ b/third_party/fwkacllib/inc/ops/ssddetectionoutput_ops.h
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_OP_SSDDETECTIONOUTPUT_OPS_H_
+#define GE_OP_SSDDETECTIONOUTPUT_OPS_H_
+#include "graph/operator_reg.h"
+
+namespace ge {
+/**
+*@brief Returns detection result.
+
+*@par Inputs:
+* Three inputs, including:
+*@li bbox_delta: An ND tensor of type float16 or float32, specifying the box location predictions, used as the input of operator SSDDetectionOutput.
+*@li score: An ND tensor of type float16 or float32, specifying the box confidences, used as the input of operator SSDDetectionOutput.
+*@li anchors: An ND tensor of type float16 or float32, output from operator PriorBoxD, used as the input of operator SSDDetectionOutput.
+
+*@par Attributes:
+*@li num_classes: An optional int32, specifying the number of classes to be predicted. Defaults to "2". The value must be greater than 1 and less than 1025.
+*@li share_location: An optional bool, specifying whether the location is shared across classes. Defaults to "true".
+*@li background_label_id: An optional int32, specifying the background label ID. Must be 0.
+*@li iou_threshold: An optional float32, specifying the NMS threshold. Defaults to "0.3".
+*@li top_k: An optional int32, specifying the top-k value. Defaults to "200".
+*@li eta: An optional float32, specifying the eta value. Defaults to "1.0".
+*@li variance_encoded_in_target: An optional bool, specifying whether the variance is encoded in the target. Defaults to "false".
+*@li code_type: An optional int32, specifying the code type. Defaults to "1" (only "2" is supported). The value "1" indicates corner, "2" indicates center_size, and "3" indicates corner_size.
+*@li keep_top_k: An optional int32, specifying the top-k value after NMS. Defaults to "-1".
+*@li confidence_threshold: An optional float32, specifying the confidence threshold for filtering. Only detections with a confidence greater than the threshold are considered.
+
+*@par Outputs:
+*@li out_boxnum: An NCHW tensor of type int32, specifying the number of output boxes.
+*@li y: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence.
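+*
+*@par Example
+*A minimal construction sketch (informative only; it assumes the
+*set_input_/set_attr_ accessors generated by operator_reg.h, and
+*"bbox_delta", "score" and "anchors" stand for upstream operator outputs):
+*@code
+*    ge::op::SSDDetectionOutput detect("ssd_detection_output");
+*    detect.set_input_bbox_delta(bbox_delta)
+*          .set_input_score(score)
+*          .set_input_anchors(anchors)
+*          .set_attr_num_classes(21)
+*          .set_attr_iou_threshold(0.45f)
+*          .set_attr_keep_top_k(200);
+*@endcode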
+ +*/ +REG_OP(SSDDetectionOutput) + .INPUT(bbox_delta, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(score, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(anchors, TensorType({DT_FLOAT, DT_FLOAT16})) + .OUTPUT(out_boxnum, TensorType({DT_INT32})) + .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) + .ATTR(num_classes, Int, 2) + .ATTR(share_location, Bool, true) + .ATTR(background_label_id, Int, 0) + .ATTR(iou_threshold, Float, 0.3) + .ATTR(top_k, Int, 200) + .ATTR(eta, Float, 1.0) + .ATTR(variance_encoded_in_target, Bool, false) + .ATTR(code_type, Int, 1) + .ATTR(keep_top_k, Int, -1) + .ATTR(confidence_threshold, Float, 0.0) + .OP_END_FACTORY_REG(SSDDetectionOutput) +} +#endif diff --git a/third_party/fwkacllib/inc/ops/state_ops.h b/third_party/fwkacllib/inc/ops/state_ops.h index 524901f6..879d7c67 100644 --- a/third_party/fwkacllib/inc/ops/state_ops.h +++ b/third_party/fwkacllib/inc/ops/state_ops.h @@ -55,21 +55,21 @@ pass the reference to the variable tensor to the matching DestroyTemporaryVariab *@par Attributes: *@li shape: A required list of int32 or int64. The shape of the variable tensor. *@li dtype: Required. The type of elements in the variable tensor. -*@li var_name: An optional string. The name of the variable to be created. +*@li var_name: An optional string. The name of the variable to be created. *@par Outputs: *y: The created variable tensor. */ REG_OP(TemporaryVariable) .OUTPUT(y, TensorType::ALL()) - .ATTR(shape, ListInt, {}) - .ATTR(dtype, Int, 0) + .REQUIRED_ATTR(shape, ListInt) + .REQUIRED_ATTR(dtype, Int) .ATTR(var_name, String, "") .OP_END_FACTORY_REG(TemporaryVariable) /** *@brief Destroys the temporary variable and returns its final value. \n -All other uses of the temporary variable must have been executed before this op. +All other uses of the temporary variable must have been executed before this op. *@par Inputs: *x: A reference to the temporary variable tensor. @@ -102,6 +102,30 @@ REG_OP(IsVariableInitialized) .OUTPUT(y, TensorType({DT_BOOL})) .OP_END_FACTORY_REG(IsVariableInitialized) +REG_OP(VarIsInitializedOp) + .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, + DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_BOOL})) + .OP_END_FACTORY_REG(VarIsInitializedOp) + +/** +*@brief Increments 'ref' until it reaches 'limit'. + +*@par Inputs: +*Inputs include: \n +*ref: A mutable Tensor. Must be one of the following types: int32, int64. + +*@par Attributes: +*limit: An int. If incrementing ref would bring it above limit, instead \n + generates an 'OutOfRange' error. + +*@par Outputs: +*y: A Tensor. Has the same type as ref. + +*@attention Constraints:\n +*-The implementation for CountUpTo on Ascend uses AICPU, with bad performance.\n + +*/ REG_OP(CountUpTo) .INPUT(ref, TensorType({DT_INT32, DT_INT64})) .OUTPUT(y, TensorType({DT_INT32, DT_INT64})) diff --git a/third_party/fwkacllib/inc/ops/stateful_random_ops.h b/third_party/fwkacllib/inc/ops/stateful_random_ops.h new file mode 100644 index 00000000..929481d5 --- /dev/null +++ b/third_party/fwkacllib/inc/ops/stateful_random_ops.h @@ -0,0 +1,216 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_OP_STATEFUL_RANDOM_OPS_H
+#define GE_OP_STATEFUL_RANDOM_OPS_H
+
+#include "graph/operator.h"
+#include "graph/operator_reg.h"
+
+namespace ge {
+
+/**
+*@brief Non-deterministically generates some integers.
+
+*@par Inputs:
+*This op may use some OS-provided source of non-determinism (e.g. an RNG), \n
+*so each execution will give different results. Inputs include:
+*@li shape: The shape of the output tensor.
+
+*@par Outputs:
+*y: Non-deterministic integer values with the specified shape.
+
+*/
+
+REG_OP(NonDeterministicInts)
+    .INPUT(shape, TensorType({DT_INT32,DT_INT64}))
+    .OUTPUT(y, TensorType({DT_INT32,DT_INT64}))
+    .REQUIRED_ATTR(dtype, Type)
+    .OP_END_FACTORY_REG(NonDeterministicInts)
+
+/**
+*@brief Advance the counter of a counter-based RNG. The state of the RNG after \n
+*`rng_skip(n)` will be the same as that after `stateful_uniform([n])` \n
+*(or any other distribution). The actual increment added to the \n
+*counter is an unspecified implementation detail.
+
+*@par Inputs:
+*@li x: The handle of the resource variable that stores the state of the RNG.
+*@li algorithm: The RNG algorithm.
+*@li delta: The amount of advancement.
+
+*This operator has no output; it only advances the RNG state.
+
+*/
+
+REG_OP(RngSkip)
+    .INPUT(x, TensorType({DT_RESOURCE}))
+    .INPUT(algorithm, TensorType({DT_INT64}))
+    .INPUT(delta, TensorType({DT_INT64}))
+    .OP_END_FACTORY_REG(RngSkip)
+
+/**
+*@brief Outputs random values from a binomial distribution. \n
+Each output element is drawn from the binomial distribution defined by the \n
+corresponding elements of `counts` (the number of trials) and `probs` \n
+(the probability of success of a single trial).
+
+*@par Inputs:
+*@li x: The handle of the resource variable that stores the state of the RNG.
+*@li algorithm: The RNG algorithm.
+*@li shape: The shape of the output tensor.
+*@li counts: The number of trials per output element.
+*@li probs: The probability of success of a single trial, per output element.
+
+*@par Outputs:
+*y: Random values with the specified shape.
+
+*/
+
+REG_OP(StatefulRandomBinomial)
+    .INPUT(x, TensorType({DT_RESOURCE}))
+    .INPUT(algorithm, TensorType({DT_INT64}))
+    .INPUT(shape, TensorType({DT_INT32, DT_INT64}))
+    .INPUT(counts, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
+    .INPUT(probs, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64}))
+    .REQUIRED_ATTR(dtype, Type)
+    .OP_END_FACTORY_REG(StatefulRandomBinomial)
+
+/**
+*@brief Outputs random values from a normal distribution. \n
+*The generated values will have mean 0 and standard deviation 1.
+
+*@par Inputs:
+*@li x: The handle of the resource variable that stores the state of the RNG.
+*@li algorithm: The RNG algorithm.
+*@li shape: The shape of the output tensor.
+
+*@par Outputs:
+*y: A tensor of the specified shape filled with random normal values.
+
+*/
+
+REG_OP(StatefulStandardNormalV2)
+    .INPUT(x, TensorType({DT_RESOURCE}))
+    .INPUT(algorithm, TensorType({DT_INT64}))
+    .INPUT(shape, TensorType({DT_INT64}))
+    .OUTPUT(y, TensorType({DT_FLOAT}))
+    .OP_END_FACTORY_REG(StatefulStandardNormalV2)
+
+/**
+*@brief Outputs random values from a truncated normal distribution. \n
+*The generated values follow a normal distribution with mean 0 and standard \n
+*deviation 1, except that values whose magnitude is more than 2 standard \n
+*deviations from the mean are dropped and re-picked.
+
+*@par Inputs:
+*@li x: The handle of the resource variable that stores the state of the RNG.
+*@li algorithm: The RNG algorithm.
+*@li shape: The shape of the output tensor.
+
+*@par Outputs:
+*y: Random values with the specified shape.
+
+*/
+
+REG_OP(StatefulTruncatedNormal)
+    .INPUT(x, TensorType({DT_RESOURCE}))
+    .INPUT(algorithm, TensorType({DT_INT64}))
+    .INPUT(shape, TensorType({DT_INT64}))
+    .OUTPUT(y, TensorType({DT_FLOAT}))
+    .OP_END_FACTORY_REG(StatefulTruncatedNormal)
+
+/**
+*@brief Outputs random values from a uniform distribution. \n
+The generated values follow a uniform distribution in the range `[0, 1)`. The \n
+lower bound 0 is included in the range, while the upper bound 1 is excluded. \n
+
+*@par Inputs:
+*@li x: The handle of the resource variable that stores the state of the RNG.
+*@li algorithm: The RNG algorithm.
+*@li shape: The shape of the output tensor.
+
+*@par Outputs:
+*y: Random values with the specified shape.
+
+*/
+
+REG_OP(StatefulUniform)
+    .INPUT(x, TensorType({DT_RESOURCE}))
+    .INPUT(algorithm, TensorType({DT_INT64}))
+    .INPUT(shape, TensorType({DT_INT64}))
+    .OUTPUT(y, TensorType({DT_FLOAT}))
+    .OP_END_FACTORY_REG(StatefulUniform)
+
+/**
+*@brief Outputs random integers from a uniform distribution. \n
+The generated values are uniform integers covering the whole range of `dtype`.
+
+*@par Inputs:
+*@li x: The handle of the resource variable that stores the state of the RNG.
+*@li algorithm: The RNG algorithm.
+*@li shape: The shape of the output tensor.
+
+*@par Outputs:
+*y: Random values with the specified shape.
+
+*/
+
+REG_OP(StatefulUniformFullInt)
+    .INPUT(x, TensorType({DT_RESOURCE}))
+    .INPUT(algorithm, TensorType({DT_INT64}))
+    .INPUT(shape, TensorType({DT_INT64}))
+    .OUTPUT(y, TensorType({DT_INT64}))
+    .OP_END_FACTORY_REG(StatefulUniformFullInt)
+
+/**
+*@brief Outputs random integers from a uniform distribution. \n
+The generated values are uniform integers in the range `[minval, maxval)`. \n
+The lower bound `minval` is included in the range, while the upper bound \n
+`maxval` is excluded. \n
+The random integers are slightly biased unless `maxval - minval` is an exact \n
+power of two. The bias is small for values of `maxval - minval` significantly \n
+smaller than the range of the output (either `2^32` or `2^64`).
+
+*@par Inputs:
+*@li x: The handle of the resource variable that stores the state of the RNG.
+*@li algorithm: The RNG algorithm.
+*@li shape: The shape of the output tensor.
+*@li minval: Minimum value (inclusive, scalar).
+*@li maxval: Maximum value (exclusive, scalar).
+
+*@par Outputs:
+*y: Random values with the specified shape.
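+*
+*@par Note
+* A sketch of the modulo mapping behind the bias claim above (illustrative):
+* with a 3-bit generator (8 equally likely raw values) and a range of 3,
+* residues 0 and 1 each receive three raw values while residue 2 receives
+* only two, so the distribution is uneven unless the range divides the
+* generator period exactly:
+*
+*     uint32_t raw[8] = {0, 1, 2, 3, 4, 5, 6, 7};  // equally likely raw draws
+*     // raw[i] % 3 -> 0,1,2,0,1,2,0,1 : value 2 appears twice, 0 and 1 thrice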
+
+*/
+
+REG_OP(StatefulUniformInt)
+    .INPUT(x, TensorType({DT_RESOURCE}))
+    .INPUT(algorithm, TensorType({DT_INT64}))
+    .INPUT(shape, TensorType({DT_INT64}))
+    .INPUT(minval, TensorType({DT_INT64}))
+    .INPUT(maxval, TensorType({DT_INT64}))
+    .OUTPUT(y, TensorType({DT_INT64}))
+    .OP_END_FACTORY_REG(StatefulUniformInt)
+
+}  // namespace ge
+
+#endif  // GE_OP_STATEFUL_RANDOM_OPS_H
\ No newline at end of file
diff --git a/third_party/fwkacllib/inc/ops/stateless_random_ops.h b/third_party/fwkacllib/inc/ops/stateless_random_ops.h
index c793574b..bb8e015a 100644
--- a/third_party/fwkacllib/inc/ops/stateless_random_ops.h
+++ b/third_party/fwkacllib/inc/ops/stateless_random_ops.h
@@ -22,6 +22,25 @@
 
 namespace ge {
 
+/**
+*@brief Draws samples from a multinomial distribution.
+
+*@par Inputs:
+include: \n
+*@li logits: 2-D Tensor with shape [batch_size, num_classes]. Each slice [i, :]\n
+*represents the unnormalized log probabilities for all classes.
+*@li num_samples: 0-D. Number of independent samples to draw for each row slice.
+*@li seed: The seed used to generate random numbers.
+
+*@par Attributes:
+*output_dtype: Output data type.
+
+*@par Outputs:
+*y: Output random numbers.
+
+*@see StatelessMultinomial()
+
+*/
 REG_OP(StatelessMultinomial)
     .INPUT(logits, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE}))
     .INPUT(num_samples, TensorType({DT_INT32}))
@@ -30,6 +49,28 @@ REG_OP(StatelessMultinomial)
     .ATTR(output_dtype, Type, DT_INT64)
     .OP_END_FACTORY_REG(StatelessMultinomial)
 
+/**
+*@brief Outputs deterministic pseudorandom integers from a uniform distribution.
+
+*@par Inputs:
+*@li shape: The shape of the output tensor.
+*@li seed: 2 seeds (shape [2]).
+*@li minval: Minimum value (inclusive, scalar).
+*@li maxval: Maximum value (exclusive, scalar).
+
+*@par Outputs:
+*y: Random values with the specified shape.
+
+*/
+
+REG_OP(StatelessRandomUniformInt)
+    .INPUT(shape, TensorType({DT_INT32, DT_INT64}))
+    .INPUT(seed, TensorType({DT_INT32, DT_INT64}))
+    .INPUT(minval, TensorType({DT_INT32, DT_INT64}))
+    .INPUT(maxval, TensorType({DT_INT32, DT_INT64}))
+    .OUTPUT(y, TensorType({DT_INT32, DT_INT64}))
+    .OP_END_FACTORY_REG(StatelessRandomUniformInt)
+
 }  // namespace ge
 
 #endif //GE_OP_STATELESS_RANDOM_OPS_H
\ No newline at end of file
diff --git a/third_party/fwkacllib/inc/ops/string_ops.h b/third_party/fwkacllib/inc/ops/string_ops.h
index 20926748..9b87817f 100644
--- a/third_party/fwkacllib/inc/ops/string_ops.h
+++ b/third_party/fwkacllib/inc/ops/string_ops.h
@@ -21,6 +21,28 @@
 #include "graph/operator_reg.h"
 
 namespace ge {
+
+/**
+*@brief Split elements of input based on delimiter into a SparseTensor.
+
+*@par Inputs:
+include: \n
+*@li input: 1-D. Strings to split.
+*@li delimiter: 0-D. Delimiter characters (bytes), or empty string.
+
+*@par Attributes:
+* skip_empty: A bool. If True, skip the empty strings from the result.
+
+*@par Outputs:
+*@li indices: A dense matrix of int64 representing the indices of the sparse tensor.
+*@li values: A vector of strings corresponding to the split values.
+*@li shape: A length-2 vector of int64 representing the shape of the sparse tensor,\n
+*where the first value is N and the second value is the maximum number of tokens\n
+*in a single input entry.
+
+*@see StringSplit()
+
+*/
 REG_OP(StringSplit)
     .INPUT(input, TensorType({DT_STRING}))
     .INPUT(delimiter, TensorType({DT_STRING}))
@@ -30,6 +52,27 @@ REG_OP(StringSplit)
     .ATTR(skip_empty, Bool, true)
     .OP_END_FACTORY_REG(StringSplit)
 
+/**
+*@brief Split elements of source based on sep into a SparseTensor.
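+*
+* For example (illustrative), splitting input ["a b c", "d e"] on " " yields
+* indices [[0,0],[0,1],[0,2],[1,0],[1,1]], values ["a","b","c","d","e"] and
+* shape [2, 3].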
+
+*@par Inputs:
+include: \n
+*@li input: 1-D. Strings to split.
+*@li sep: 0-D string Tensor, the delimiter character.
+
+*@par Attributes:
+* maxsplit: An int. If maxsplit > 0, limits the number of splits in the result.
+
+*@par Outputs:
+*@li indices: A dense matrix of int64 representing the indices of the sparse tensor.
+*@li values: A vector of strings corresponding to the split values.
+*@li shape: A length-2 vector of int64 representing the shape of the sparse tensor,\n
+*where the first value is N and the second value is the maximum number of tokens\n
+*in a single input entry.
+
+*@see StringSplitV2()
+
+*/
 REG_OP(StringSplitV2)
     .INPUT(input, TensorType({DT_STRING}))
     .INPUT(sep, TensorType({DT_STRING}))
@@ -39,11 +82,56 @@ REG_OP(StringSplitV2)
     .ATTR(maxsplit, Int, -1)
     .OP_END_FACTORY_REG(StringSplitV2)
 
+/**
+*@brief Determine the script codes of a given tensor of Unicode integer code points.
+
+*@par Inputs:
+include: \n
+*x: A Tensor of int32 Unicode code points.
+
+*@par Outputs:
+*y: A Tensor of int32 script codes corresponding to each input code point.
+
+*@attention Constraints:\n
+*This operation converts Unicode code points to script codes corresponding to\n
+*each code point.\nScript codes correspond to International Components for\n
+*Unicode (ICU) UScriptCode values.\n
+*See http://icu-project.org/apiref/icu4c/uscript_8h.html.\n
+*Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints.\n
+*Output shape will match input shape.
+
+*@see UnicodeScript()
+
+*/
 REG_OP(UnicodeScript)
     .INPUT(x, TensorType({DT_INT32}))
     .OUTPUT(y, TensorType({DT_INT32}))
     .OP_END_FACTORY_REG(UnicodeScript)
 
+/**
+*@brief Return substrings from Tensor of strings.
+
+*@par Inputs:
+include: \n
+*@li input: Tensor of strings.
+*@li pos: Scalar defining the position of the first character in each substring.
+*@li len: Scalar defining the number of characters to include in each substring.
+
+*@par Outputs:
+*output: Tensor of substrings.
+
+*@attention Constraints:\n
+*"pos" and "len" must have the same shape as each other and must follow\n
+*broadcasting rules with respect to "input".
+
+*@see Substr()
+
+*/
 REG_OP(Substr)
     .INPUT(input, TensorType({DT_STRING}))
     .INPUT(pos, TensorType({DT_INT32, DT_INT64}))
@@ -51,12 +139,59 @@ REG_OP(Substr)
     .OUTPUT(output, TensorType({DT_STRING}))
     .OP_END_FACTORY_REG(Substr)
 
+/**
+*@brief Converts each string in the input Tensor to its hash mod by a number of buckets.
+
+*@par Inputs:
+include: \n
+*string_tensor: The strings to assign a hash bucket.
+
+*@par Outputs:
+*y: A Tensor of the same shape as the input x.
+
+*@attention Constraints:\n
+*The hash function is deterministic on the content of the string within\n
+*the process and will never change. However, it is not suitable for cryptography.\n
+*This function may be used when CPU time is scarce and inputs are trusted or\n
+*unimportant. There is a risk of adversaries constructing inputs that all hash\n
+*to the same bucket. To prevent this problem, use a strong hash function with\n
+*tf.string_to_hash_bucket_strong.
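+*
+* For example (illustrative), with num_buckets = 2 an input of
+* ["Hello", "TensorFlow", "2.x"] could map to [0, 0, 1]; the buckets depend
+* only on the string contents, so the mapping is stable within a process.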
+
+*@see StringToHashBucketFast()
+
+*/
 REG_OP(StringToHashBucketFast)
     .INPUT(x, TensorType({DT_STRING}))
     .OUTPUT(y, TensorType({DT_INT64}))
     .ATTR(num_buckets, Int, 1)
     .OP_END_FACTORY_REG(StringToHashBucketFast)
 
+/**
+*@brief Converts each string in the input Tensor to its hash mod by a number of buckets.
+
+*@par Inputs:
+include: \n
+*x: The strings to assign a hash bucket.
+
+*@par Attributes:
+*num_buckets: The number of buckets.
+
+*@par Outputs:
+*y: A Tensor of the same shape as the input x.
+
+*@attention Constraints:\n
+*@li A strong hash is important when inputs may be malicious, e.g. URLs with\n
+*additional components. Adversaries could try to make their inputs hash to\n
+*the same bucket for a denial-of-service attack or to skew the results.\n
+*A strong hash can be used to make it difficult to find inputs with a skewed\n
+*hash value distribution over buckets. This requires that the hash function\n
+*is seeded by a high-entropy (random) "key" unknown to the adversary.
+*@li The additional robustness comes at a cost of roughly 4x higher\n
+*compute time than tf.string_to_hash_bucket_fast.
+
+*@see StringToHashBucketStrong()
+
+*/
 REG_OP(StringToHashBucketStrong)
     .INPUT(x, TensorType({DT_STRING}))
     .OUTPUT(y, TensorType({DT_INT64}))
@@ -64,23 +199,97 @@ REG_OP(StringToHashBucketStrong)
     .REQUIRED_ATTR(key, ListInt)
     .OP_END_FACTORY_REG(StringToHashBucketStrong)
 
+/**
+*@brief Converts each string in the input Tensor to its hash mod by a number of buckets.
+
+*@par Inputs:
+include: \n
+*string_tensor: The strings to assign a hash bucket.
+
+*@par Attributes:
+*num_buckets: The number of buckets.
+
+*@par Outputs:
+*y: A Tensor of the same shape as the input string_tensor.
+
+*@see StringToHashBucket()
+
+*/
 REG_OP(StringToHashBucket)
     .INPUT(string_tensor, TensorType({DT_STRING}))
     .OUTPUT(y, TensorType({DT_INT64}))
     .ATTR(num_buckets, Int, 1)
     .OP_END_FACTORY_REG(StringToHashBucket)
 
+/**
+*@brief Strip leading and trailing whitespaces from the Tensor.
+
+*@par Inputs:
+include: \n
+*x: A string Tensor of any shape.
+
+*@par Outputs:
+*y: A string Tensor of the same shape as the input.
+
+*@see StringStrip()
+
+*/
 REG_OP(StringStrip)
     .INPUT(x, TensorType({DT_STRING}))
     .OUTPUT(y, TensorType({DT_STRING}))
     .OP_END_FACTORY_REG(StringStrip)
 
+/**
+*@brief Computes the length of each string given in the input tensor.
+
+*@par Inputs:
+include: \n
+*x: The string for which to compute the length.
+
+*@par Attributes:
+*unit: The unit that is counted to compute string length.\n
+*One of: "BYTE" (for the number of bytes in each string) or\n
+*"UTF8_CHAR" (for the number of UTF-8 encoded Unicode code points in each string).\n
+*Results are undefined if unit=UTF8_CHAR and the input strings do not contain\n
+*structurally valid UTF-8.
+
+*@par Outputs:
+*y: Integer tensor that has the same shape as input.\n
+*The output contains the element-wise string lengths of input.
+
+*@see StringLength()
+
+*/
 REG_OP(StringLength)
     .INPUT(x, TensorType({DT_STRING}))
     .OUTPUT(y, TensorType({DT_INT32}))
     .ATTR(unit, String, "BYTE")
     .OP_END_FACTORY_REG(StringLength)
 
+/**
+*@brief Joins the strings in the given list of string tensors into one tensor.
+
+*@par Inputs:
+include: \n
+*x: A list of string tensors. The tensors must all have the same shape,\n
+*or be scalars. Scalars may be mixed in; these will be broadcast to the shape\n
+*of non-scalar inputs.
+
+*@par Attributes:
+*@li N: The length of input x.
+*@li separator: string, an optional join separator.
+
+*@par Outputs:
+*y: The output tensor.
+
+*@see StringJoin()
+
+*/
 REG_OP(StringJoin)
     .DYNAMIC_INPUT(x, TensorType({DT_STRING}))
     .OUTPUT(y, TensorType({DT_STRING}))
@@ -88,6 +297,29 @@ REG_OP(StringJoin)
     .ATTR(separator, String, "")
     .OP_END_FACTORY_REG(StringJoin)
 
+/**
+*@brief Formats a string template using a list of tensors.
+
+*@par Inputs:
+include: \n
+*x: The tensors to format into the placeholder string.
+
+*@par Attributes:
+*@li template: A string, the template to format tensor summaries into.
+*@li placeholder: A string; at each placeholder in the template a subsequent tensor summary will be inserted.
+*@li summarize: When formatting the tensor summaries, print the first and last "summarize" entries of each tensor dimension.
+
+*@par Outputs:
+*y: The resulting string scalar.
+
+*@see StringFormat()
+
+*/
 REG_OP(StringFormat)
     .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
         DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_STRING, DT_FLOAT16, \
@@ -98,12 +330,55 @@ REG_OP(StringFormat)
     .ATTR(summarize, Int, 3)
     .OP_END_FACTORY_REG(StringFormat)
 
+/**
+*@brief Check if the input matches the regex pattern.
+
+*@par Inputs:
+*The input is a string tensor of any shape. The pattern is a scalar string tensor\n
+*which is applied to every element of the input tensor. The boolean values \n
+*(True or False) of the output tensor indicate if the input matches the regex\n
+*pattern provided. The pattern follows the re2 syntax\n
+*(https://github.com/google/re2/wiki/Syntax).: \n
+include: \n
+*@li x: A string tensor of the text to be processed.
+*@li pattern: A scalar string tensor containing the regular expression to match the input.
+
+*@par Outputs:
+*y: A bool tensor with the same shape as input.
+
+*@see RegexFullMatch()
+
+*/
 REG_OP(RegexFullMatch)
     .INPUT(x, TensorType({DT_STRING}))
     .INPUT(pattern, TensorType({DT_STRING}))
     .OUTPUT(y, TensorType({DT_BOOL}))
     .OP_END_FACTORY_REG(RegexFullMatch)
 
+/**
+*@brief Replaces matches of the pattern regular expression in input with the\n
+*replacement string provided in rewrite.
+
+*@par Inputs:
+*It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax).: \n
+include: \n
+*@li x: The text to be processed.
+*@li pattern: The regular expression to be matched in the input strings.
+*@li rewrite: The rewrite string to be substituted for the pattern expression\n
+*where it is matched in the input strings.
+
+*@par Attributes:
+*replace_global: If True, the replacement is global\n
+*(that is, all matches of the pattern regular expression in each input string\n
+*are rewritten), otherwise the rewrite substitution is only made for the first\n
+*pattern match.
+
+*@par Outputs:
+*y: The text after applying pattern match and rewrite substitution.
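+*
+* For example (illustrative), with pattern "a" and rewrite "x", the input
+* "aba aba" becomes "xbx xbx" when replace_global is true, and "xba aba"
+* when it is false.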
+
+*@see RegexReplace()
+
+*/
 REG_OP(RegexReplace)
     .INPUT(x, TensorType({DT_STRING}))
     .INPUT(pattern, TensorType({DT_STRING}))
@@ -112,6 +387,31 @@ REG_OP(RegexReplace)
     .ATTR(replace_global, Bool, true)
     .OP_END_FACTORY_REG(RegexReplace)
 
+/**
+*@brief Converts each entry in the given tensor to strings.
+
+*@par Inputs:
+*Supports many numeric types and boolean.: \n
+include: \n
+*x: A tensor that can be converted to string.
+
+*@par Attributes:
+*@li precision: The post-decimal precision to use for floating point numbers.\n
+*Only used if precision > -1.
+*@li scientific: Use scientific notation for floating point numbers.
+*@li shortest: Use the shortest representation (either scientific or standard)\n
+*for floating point numbers.
+*@li width: Pad pre-decimal numbers to this width. Applies to both floating\n
+*point and integer numbers. Only used if width > -1.
+*@li fill: The value to pad if width > -1. If empty, pads with spaces.\n
+*Another typical value is '0'. String cannot be longer than 1 character.
+
+*@par Outputs:
+*y: The output tensor.
+
+*@see AsString()
+
+*/
 REG_OP(AsString)
     .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \
         DT_DOUBLE, DT_BOOL}))
@@ -123,12 +423,52 @@ REG_OP(AsString)
     .ATTR(fill, String, "")
     .OP_END_FACTORY_REG(AsString)
 
+/**
+*@brief Encode strings into web-safe base64 format.
+
+*@par Inputs:
+*Web-safe means that the encoder uses - and _ instead of + and /.: \n
+include: \n
+*x: Strings to be encoded.
+
+*@par Attributes:
+*pad: Bool, whether padding is applied at the ends.
+
+*@par Outputs:
+*y: Input strings encoded in base64.
+
+*@attention Constraints:\n
+*Refer to the following article for more information on base64 format:\n
+*en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '='\n
+*at the end so that the encoded string has a length that is a multiple of 4.\n
+*See the Padding section of the link above. Web-safe means that the encoder\n
+*uses - and _ instead of + and /.
+
+*@see EncodeBase64()
+
+*/
 REG_OP(EncodeBase64)
     .INPUT(x, TensorType({DT_STRING}))
     .OUTPUT(y, TensorType({DT_STRING}))
     .ATTR(pad, Bool, false)
     .OP_END_FACTORY_REG(EncodeBase64)
 
+/**
+*@brief Decode web-safe base64-encoded strings.
+
+*@par Inputs:
+*Input may or may not have padding at the end. See EncodeBase64 for padding.\n
+*Web-safe means that input must use - and _ instead of + and /.: \n
+include: \n
+*x: Base64 strings to decode.
+
+*@par Outputs:
+*y: Decoded strings.
+
+*@see DecodeBase64()
+
+*/
 REG_OP(DecodeBase64)
     .INPUT(x, TensorType({DT_STRING}))
     .OUTPUT(y, TensorType({DT_STRING}))
diff --git a/third_party/fwkacllib/inc/ops/swap_co_ops.h b/third_party/fwkacllib/inc/ops/swap_co_ops.h
new file mode 100644
index 00000000..02f1451b
--- /dev/null
+++ b/third_party/fwkacllib/inc/ops/swap_co_ops.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_OP_SWAP_CO_OPS_H_
+#define GE_OP_SWAP_CO_OPS_H_
+
+#include "graph/operator_reg.h"
+
+namespace ge {
+
+/**
+*@brief Folds the convolution input weight constant of the preceding layer \n
+* of PSROIPooling to convert the N dimension of the weight from \n
+* (output_dim, group_size*group_size) to \n
+* (group_size*group_size, int((output_dim+15)/C0)*C0).
+*@see PSROIPooling
+
+*@par Inputs:
+* One input:
+*x: An NCHW tensor of type float16 or float32, describing the weight of\n
+* convolution. Dim N must equal output_dim*group_size*group_size.
+
+*@par Attributes:
+*@li output_dim: A required int32, specifying the number of output channels.\n
+* Must be greater than "0".
+*@li group_size: A required int32, specifying the number of groups to encode\n
+* position-sensitive score maps. Must be within the range (0, 128).
+
+*@par Outputs:
+*y: An NCHW tensor of type float16 or float32, describing the result weight\n
+* of convolution.
+*/
+
+REG_OP(SwapCo)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .ATTR(output_dim, Int, 0)
+    .ATTR(group_size, Int, 0)
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OP_END_FACTORY_REG(SwapCo)
+
+}  // namespace ge
+
+#endif  // GE_OP_SWAP_CO_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/transformation_ops.h b/third_party/fwkacllib/inc/ops/transformation_ops.h
index a821656a..689cde4e 100644
--- a/third_party/fwkacllib/inc/ops/transformation_ops.h
+++ b/third_party/fwkacllib/inc/ops/transformation_ops.h
@@ -47,7 +47,7 @@ REG_OP(DepthwiseWeight6DTo4D)
 REG_OP(TransposeD)
     .INPUT(x, TensorType::BasicType())
     .OUTPUT(y, TensorType::BasicType())
-    .ATTR(perm, ListInt, {})
+    .REQUIRED_ATTR(perm, ListInt)
     .OP_END_FACTORY_REG(TransposeD)
 
 /**
@@ -67,6 +67,37 @@ REG_OP(Transpose)
     .OUTPUT(y, TensorType::BasicType())
     .OP_END_FACTORY_REG(Transpose)
 
+/**
+*@brief Permutes the dimensions according to order.\n
+    The returned tensor's dimension i will correspond to the input dimension order[i].
+
+*@par Inputs:
+*x: A Tensor. Must be one of the following types: float16, float32.
+
+*@par Attributes:
+*order: A permutation of the dimensions of "x". Any axis transformation is supported.
+
+*@par Outputs:
+*y: A Tensor. Has the same type as "x".
+*/
+REG_OP(Permute)
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .ATTR(order, ListInt, {0})
+    .OP_END_FACTORY_REG(Permute)
+
+/**
+*@brief Flattens the inputs. Retains axis 0 and flattens the input tensors along axis 1.
+
+*@par Inputs:
+*One input: \n
+*x: A multi-dimensional Tensor. Must be one of the following types: \n
+int8, uint8, int16, uint16, int32, int64, float16, float32, float64.
+
+*@par Outputs:
+*y: A 2D flattened Tensor (retains axis 0 and flattens the input tensors along axis 1). Must be one of the following data types: int8, uint8, int16, uint16, int32, int64, float16, float32, float64.
+
+*/
 REG_OP(Flatten)
     .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64,
                           DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64,
@@ -76,6 +107,19 @@ REG_OP(Flatten)
                            DT_FLOAT, DT_FLOAT16}))
     .OP_END_FACTORY_REG(Flatten)
 
+/**
+*@brief Permutes and crops the input tensor.
+
+*@par Inputs:
+* Three inputs, including: \n
+*@li x: A 5D Tensor of type float16 or float32, with format NC1HWC0.
+*@li block_shape: A 1D list or tuple of int32 or int64.
+*@li crops: A 2D list or tuple of int32 or int64. Specifies the amount to crop from the start and end dimensions after permutation.
+
+*@par Outputs:
+*y: A Tensor with format NC1HWC0. Has the same type as input "x".
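+*
+* Shape example (illustrative): with an input batch of 4, block_shape = [2, 2]
+* and zero crops, the output batch becomes 4 / (2 * 2) = 1 while the height
+* and width dimensions are each doubled.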
+
+*/
 REG_OP(BatchToSpaceND)
     .INPUT(x, TensorType::BasicType())
     .INPUT(block_shape, TensorType::IndexNumberType())
@@ -83,6 +127,22 @@ REG_OP(BatchToSpaceND)
     .OUTPUT(y, TensorType::BasicType())
     .OP_END_FACTORY_REG(BatchToSpaceND)
 
+/**
+*@brief Permutes and crops the input tensor.
+
+*@par Inputs:
+* One input: \n
+*x: A 5D Tensor of type float16 or float32, with format NC1HWC0.
+
+*@par Attributes:
+*@li block_shape: A required 1D list or tuple of int32 or int64.
+*@li crops: A required 2D list or tuple of int32 or int64. Specifies the amount to crop from the start and end dimensions after permutation.
+
+*@par Outputs:
+*y: A Tensor with format NC1HWC0. Has the same type as input "x".
+
+
+*/
 REG_OP(BatchToSpaceNDD)
     .INPUT(x, TensorType::BasicType())
     .OUTPUT(y, TensorType::BasicType())
@@ -90,6 +150,19 @@ REG_OP(BatchToSpaceNDD)
     .REQUIRED_ATTR(crops, ListInt)
     .OP_END_FACTORY_REG(BatchToSpaceNDD)
 
+/**
+*@brief Pads and permutes the input tensor.
+
+*@par Inputs:
+* Three inputs, including: \n
+*@li x: A 5D Tensor of type float16 or float32, with format NC1HWC0.
+*@li block_shape: A 1D list or tuple of int32 or int64.
+*@li paddings: A 2D list or tuple of int32 or int64. Specifies the padding for the start and end dimensions after permutation.
+
+*@par Outputs:
+*y: A Tensor with format NC1HWC0. Has the same type as input "x".
+
+*/
 REG_OP(SpaceToBatchND)
     .INPUT(x, TensorType::BasicType())
     .INPUT(block_shape, TensorType::IndexNumberType())
@@ -97,6 +170,21 @@ REG_OP(SpaceToBatchND)
     .OUTPUT(y, TensorType::BasicType())
     .OP_END_FACTORY_REG(SpaceToBatchND)
 
+/**
+*@brief Pads and permutes the input tensor.
+
+*@par Inputs:
+* One input: \n
+*x: A 5D Tensor of type float16 or float32, with format NC1HWC0.
+
+*@par Attributes:
+*@li block_shape: A required 1D list or tuple of int32 or int64.
+*@li paddings: A required 2D list or tuple of int32 or int64. Specifies the padding for the start and end dimensions after permutation.
+
+*@par Outputs:
+*y: A Tensor with format NC1HWC0. Has the same type as input "x".
+
+*/
 REG_OP(SpaceToBatchNDD)
     .INPUT(x, TensorType::BasicType())
     .OUTPUT(y, TensorType::BasicType())
@@ -104,6 +192,21 @@ REG_OP(SpaceToBatchNDD)
     .REQUIRED_ATTR(paddings, ListInt)
     .OP_END_FACTORY_REG(SpaceToBatchNDD)
 
+/**
+*@brief Outputs a copy of the input tensor where values from the "height" and "width" dimensions are moved to the "depth" dimension.
+
+*@par Inputs:
+*x: An NHWC Tensor. Must be one of the following types:
+* float16, float32, double, int64, int32, uint8, uint16, uint32, uint64, int8, int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32.
+
+
+*@par Attributes:
+*@li block_size: A required int, specifying the input block size.
+*@li data_format: An optional string, either "NHWC" or "NCHW".
+
+*@par Outputs:
+*y: A Tensor. Has the same type as input "x".
+*/
 REG_OP(SpaceToDepth)
     .INPUT(x, TensorType::BasicType())
     .OUTPUT(y, TensorType::BasicType())
@@ -196,6 +299,21 @@ REG_OP(BatchToSpaceD)
     .REQUIRED_ATTR(crops, ListInt)
     .OP_END_FACTORY_REG(BatchToSpaceD)
 
+/**
+*@brief Outputs a copy of the input tensor where values from the "height" and "width" dimensions are padded and rearranged to the "batch" dimension.
+
+*@par Inputs:
+*@li x: An NC1HWC0 Tensor. Must be one of the following types:
+* float16, float32, double, int64, int32, uint8, uint16, uint32, uint64, int8, int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32.
+
+*@li paddings: A 2D tensor of type int, specifying the zero padding for the spatial dimensions.
+ +*@par Attributes: +*block_size: A required int, specifying the input block size. + +*@par Outputs: +*y: A Tensor. Has the same type as input "x". +*/ REG_OP(SpaceToBatch) .INPUT(x, TensorType::BasicType()) .INPUT(paddings, TensorType::IndexNumberType()) @@ -203,6 +321,20 @@ REG_OP(SpaceToBatch) .REQUIRED_ATTR(block_size, Int) .OP_END_FACTORY_REG(SpaceToBatch) +/** +*@brief Outputs a copy of the input tensor where values from the "height" and "width" dimensions are padded and rearranged to the "batch" dimension. + +*@par Inputs: +*x: An NC1HWC0 Tensor. Must be one of the following types: float16, float32, double, int64, int32, uint8, uint16, uint32, uint64, int8, int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32. + + +*@par Attributes: +*@li block_size: A required int, specifying the input block size. +*@li paddings: A 2D tensor. All data types are supported. + +*@par Outputs: +*y: A Tensor. Has the same type as input "x". +*/ REG_OP(SpaceToBatchD) .INPUT(x, TensorType::BasicType()) .OUTPUT(y, TensorType::BasicType()) @@ -211,11 +343,11 @@ REG_OP(SpaceToBatchD) .OP_END_FACTORY_REG(SpaceToBatchD) /** -* @brief Unpacks the given dimension of a rank-R tensor "value" into rank-(R-1) +* @brief Unpacks the given dimension of a rank-R tensor "x" into rank-(R-1) * tensors. * @par Inputs: -* @ value: A rank-R tensor (R > 0) of type BasicType, with format ND or NC1HWC0. +* @ x: A rank-R tensor (R > 0) of type BasicType, with format ND or NC1HWC0. * @par Attributes: * @li num: An optional int, specifying the number of tensors to be unpacked to. @@ -224,16 +356,16 @@ REG_OP(SpaceToBatchD) * is [-R, R). * @par Outputs: -* output: The list of Tensor objects unpacked from "value", of type BasicType. +* y: The list of Tensor objects unpacked from "x", of type BasicType. * @attention Constraints: -* @li If "num" is not specified, it is inferred from the shape of "value". +* @li If "num" is not specified, it is inferred from the shape of "x". * @li For the ND format, "axis" is in the range [-R, R); For the NC1HWC0 format, * "axis" must not be 2, 3, -2, or -3. */ REG_OP(Unpack) - .INPUT(value, TensorType::BasicType()) - .DYNAMIC_OUTPUT(output, TensorType::BasicType()) + .INPUT(x, TensorType::BasicType()) + .DYNAMIC_OUTPUT(y, TensorType::BasicType()) .REQUIRED_ATTR(num, Int) .ATTR(axis, Int, 0) .OP_END_FACTORY_REG(Unpack) @@ -243,21 +375,20 @@ REG_OP(Unpack) * dimension of the output. * @par Inputs: -* images: A 4D Tensor with shape [batch, in_rows, in_cols, depth]. +* x: A 4D Tensor with shape [batch, in_rows, in_cols, depth]. * @par Attributes: -* @li ksizes: An optional tuple or list. size of the sliding window for -* each dimension of images. -* @li strides: An optional tuple or list. How far the centers of two -* consecutive patches are in the images.\n -* Must be: [1, stride_rows, stride_cols, 1]. -* @li rates: Must be: An optional tuple or list. [1, rate_rows, rate_cols, 1]. -* This is the input stride,\n -* specifying how far two consecutive patch samples are in the input. Equivalent\n -* to extracting patches with patch_sizes_eff = patch_sizes + (patch_sizes - 1) *\n -* (rates - 1), followed by subsampling them spatially by a factor of rates. This\n -* is equivalent to rate in dilated (a.k.a. Atrous) convolutions. -* @li padding: An optional string. The type of padding algorithm to use. +* @li ksizes: A required list or tuple. The size of the sliding window for each +* dimension of images. +* @li strides: A required list or tuple. 
How far the centers of two consecutive +* patches are in the images. Must be: [1, stride_rows, stride_cols, 1]. +* @li rates: A required list or tuple. Must be: [1, rate_rows, rate_cols, 1]. \n +* This is the input stride, specifying how far two consecutive patch \n +* samples are in the input. Equivalent to extracting patches +* with patch_sizes_eff = patch_sizes + (patch_sizes - 1) *\n +* (rates - 1), followed by subsampling them spatially by a factor of rates. \n +* This is equivalent to rate in dilated (a.k.a. Atrous) convolutions. +* @li padding: A required string. The type of padding algorithm to use. * @par Outputs: * Output: A 4D Tensor with shape [batch, out_rows, out_cols, ksize_rows *\n @@ -269,12 +400,12 @@ REG_OP(Unpack) * "ksizes", "strides" and "rates" are lists of integers. */ REG_OP(ExtractImagePatches) - .INPUT(images, TensorType::REALNUMBERTYPE()) + .INPUT(x, TensorType::REALNUMBERTYPE()) .OUTPUT(y, TensorType::REALNUMBERTYPE()) - .ATTR(ksizes, ListInt, {1,3,3,1}) - .ATTR(strides, ListInt, {1,1,1,1}) - .ATTR(rates, ListInt, {1,1,1,1}) - .ATTR(padding, String, "SAME") + .REQUIRED_ATTR(ksizes, ListInt) + .REQUIRED_ATTR(strides, ListInt) + .REQUIRED_ATTR(rates, ListInt) + .REQUIRED_ATTR(padding, String) .OP_END_FACTORY_REG(ExtractImagePatches) /** @@ -321,6 +452,35 @@ REG_OP(ConfusionTranspose) .REQUIRED_ATTR(transpose_first, Bool) .OP_END_FACTORY_REG(ConfusionTranspose) +/** +*@brief Flattens the input tensor to one-dimensional. + +*@par Inputs: +*x: An ND tensor. All data types are supported. + +*@par Attributes: +*@li axis: An optional int32, specifying the first axis to flatten. All preceding axes are retained in the output. Defaults to "1". +*@li end_axis: An optional int32, specifying the last axis to flatten. All following axes are retained in the output. Defaults to "-1". + +*@par Outputs: +*y: The flattened ND tensor. All data types are supported. + +*@attention Constraints: +* "axis" and "end_axis" must be within the dimension range of the input. +*/ +REG_OP(FlattenV2) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, + DT_INT32, DT_UINT32, DT_INT64, DT_UINT64})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, + DT_INT32, DT_UINT32, DT_INT64, DT_UINT64})) + .ATTR(axis, Int, 1) + .ATTR(end_axis, Int, -1) + .OP_END_FACTORY_REG(FlattenV2) + +REG_OP(DeConvTrans) + .INPUT(x, TensorType({DT_INT8})) + .OUTPUT(y, TensorType({DT_INT8})) + .OP_END_FACTORY_REG(DeConvTrans) } // namespace ge #endif // GE_OP_TRANSFORMATION_OPS_H diff --git a/third_party/fwkacllib/inc/register/op_kernel_registry.h b/third_party/fwkacllib/inc/register/op_kernel_registry.h new file mode 100644 index 00000000..47bdca07 --- /dev/null +++ b/third_party/fwkacllib/inc/register/op_kernel_registry.h @@ -0,0 +1,48 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef INC_REGISTER_OP_KERNEL_REGISTRY_H_
+#define INC_REGISTER_OP_KERNEL_REGISTRY_H_
+#include <memory>
+#include <string>
+#include "register/register.h"
+
+namespace ge {
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpKernelRegistry {
+ public:
+  using CreateFn = HostCpuOp* (*)();
+  ~OpKernelRegistry();
+
+  static OpKernelRegistry& GetInstance() {
+    static OpKernelRegistry instance;
+    return instance;
+  }
+
+  bool IsRegistered(const std::string &op_type);
+
+  void RegisterHostCpuOp(const std::string &op_type, CreateFn create_fn);
+
+  std::unique_ptr<HostCpuOp> CreateHostCpuOp(const std::string &op_type);
+
+ private:
+  OpKernelRegistry();
+  class OpKernelRegistryImpl;
+  /*lint -e148*/
+  std::unique_ptr<OpKernelRegistryImpl> impl_;
+};
+}  // namespace ge
+
+#endif  // INC_REGISTER_OP_KERNEL_REGISTRY_H_
diff --git a/third_party/fwkacllib/inc/register/op_registry.h b/third_party/fwkacllib/inc/register/op_registry.h
index 74473868..4dd1dc5b 100644
--- a/third_party/fwkacllib/inc/register/op_registry.h
+++ b/third_party/fwkacllib/inc/register/op_registry.h
@@ -17,6 +17,7 @@
 #ifndef INC_REGISTER_OP_REGISTRY_H_
 #define INC_REGISTER_OP_REGISTRY_H_
 
+#include <climits>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -25,7 +26,7 @@
 #include "register/register.h"
 
 namespace domi {
-enum OmgMoveTypeToAttr {
+enum RemoveInputType {
   OMG_MOVE_TYPE_DTYPE = 0,
   OMG_MOVE_TYPE_VALUE,
   OMG_MOVE_TYPE_SHAPE,
@@ -33,13 +34,15 @@ enum RemoveInputType {
   OMG_MOVE_TYPE_AXIS,
   OMG_MOVE_TYPE_SCALAR_VALUE,
   OMG_REMOVE_TYPE_WITH_COND = 1000,
+  OMG_REMOVE_INPUT_WITH_ORIGINAL_TYPE,
 };
 
-struct MoveInputToAttrStu {
-  int inputIdx;
+struct RemoveInputConfigure {
+  int inputIdx = INT_MAX;
   std::string attrName;
-  OmgMoveTypeToAttr moveType;
-  bool attrValue;
+  RemoveInputType moveType;
+  bool attrValue = false;
+  std::string originalType;
 };
 
 class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistry {
@@ -56,15 +59,18 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistry {
 
   domi::ParseParamFunc GetParseParamFunc(const std::string &op_type);
 
+  domi::FusionParseParamFunc GetFusionParseParamFunc(const std::string &op_type);
+
   domi::ImplyType GetImplyTypeByOriOpType(const std::string &ori_optype);
 
-  const std::vector<MoveInputToAttrStu> &GetConstInputToAttr(const std::string &ori_optype) const;
+  const std::vector<RemoveInputConfigure> &GetRemoveInputConfigure(const std::string &ori_optype) const;
 
  private:
  std::unordered_map<domi::ImplyType, std::set<std::string>> op_ori_optype_map_;
  std::unordered_map<std::string, domi::ImplyType> op_run_mode_map_;
  std::unordered_map<std::string, ParseParamFunc> opParseParamsFnMap_;
-  std::unordered_map<std::string, std::vector<MoveInputToAttrStu>> opConstInputToAttrMap_;
+  std::unordered_map<std::string, FusionParseParamFunc> fusionOpParseParamsFnMap_;
+  std::unordered_map<std::string, std::vector<RemoveInputConfigure>> remove_input_configure_map_;
  std::unordered_map<std::string, std::string> originOpType2OmOpType_;
 };
 }  // namespace domi
diff --git a/third_party/fwkacllib/inc/runtime/base.h b/third_party/fwkacllib/inc/runtime/base.h
index 724141cc..a1b3d762 100644
--- a/third_party/fwkacllib/inc/runtime/base.h
+++ b/third_party/fwkacllib/inc/runtime/base.h
@@ -21,7 +21,7 @@
 
 #ifdef __cplusplus
 extern "C" {
-#endif  // __cplusplus
+#endif
 
 // If you need export the function of this library in Win32 dll, use __declspec(dllexport)
 #ifndef RTS_API
@@ -29,45 +29,46 @@ extern "C" {
 #define RTS_API __declspec(dllexport)
 #else
 #define RTS_API
-#endif  // RTS_DLL_EXPORT
-#endif  // RTS_API
+#endif
+#endif
 
 /**
 * @ingroup dvrt_base
 * @brief runtime error numbers.
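*
* Every RTS_API entry point returns one of these codes; a minimal check
* pattern (an illustrative sketch, error handling elided elsewhere):
*     rtError_t rc = rtCtxDestroy(ctx);
*     if (rc != RT_ERROR_NONE) {
*         // handle or log the failure
*     }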
*/
typedef enum tagRtError {
-  RT_ERROR_NONE = 0x0,                      // succes
-  RT_ERROR_INVALID_VALUE = 0x1,             // invalid value
-  RT_ERROR_MEMORY_ALLOCATION = 0x2,         // memory allocation fail
-  RT_ERROR_INVALID_RESOURCE_HANDLE = 0x3,   // invalid handle
-  RT_ERROR_INVALID_DEVICE_POINTER = 0x4,    // invalid device point
-  RT_ERROR_INVALID_MEMCPY_DIRECTION = 0x5,  // invalid memory copy dirction
-  RT_ERROR_INVALID_DEVICE = 0x6,            // invalid device
-  RT_ERROR_NO_DEVICE = 0x7,                 // no valid device
-  RT_ERROR_CMD_OCCUPY_FAILURE = 0x8,        // command occpuy failure
-  RT_ERROR_SET_SIGNAL_FAILURE = 0x9,        // set signal failure
-  RT_ERROR_UNSET_SIGNAL_FAILURE = 0xA,      // unset signal failure
-  RT_ERROR_OPEN_FILE_FAILURE = 0xB,         // unset signal failure
-  RT_ERROR_WRITE_FILE_FAILURE = 0xC,
-  RT_ERROR_MEMORY_ADDRESS_UNALIGNED = 0xD,
-  RT_ERROR_DRV_ERR = 0xE,
-  RT_ERROR_LOST_HEARTBEAT = 0xF,
-  RT_ERROR_REPORT_TIMEOUT = 0x10,
-  RT_ERROR_NOT_READY = 0x11,
-  RT_ERROR_DATA_OPERATION_FAIL = 0x12,
-  RT_ERROR_INVALID_L2_INSTR_SIZE = 0x13,
-  RT_ERROR_DEVICE_PROC_HANG_OUT = 0x14,
-  RT_ERROR_DEVICE_POWER_UP_FAIL = 0x15,
-  RT_ERROR_DEVICE_POWER_DOWN_FAIL = 0x16,
-  RT_ERROR_FEATURE_NOT_SUPPROT = 0x17,
-  RT_ERROR_KERNEL_DUPLICATE = 0x18,         // register same kernel repeatly
-  RT_ERROR_MODEL_STREAM_EXE_FAILED = 0x91,  // the model stream failed
-  RT_ERROR_MODEL_LOAD_FAILED = 0x94,        // the model stream failed
-  RT_ERROR_END_OF_SEQUENCE = 0x95,          // end of sequence
-  RT_ERROR_NO_STREAM_CB_REG = 0x96,         // no callback register info for stream
-  RT_ERROR_DATA_DUMP_LOAD_FAILED = 0x97,    // data dump load info fail
-  RT_ERROR_RESERVED
+  RT_ERROR_NONE = 0x0,                            // success
+  RT_ERROR_INVALID_VALUE = 0x1,                   // invalid value
+  RT_ERROR_MEMORY_ALLOCATION = 0x2,               // memory allocation fail
+  RT_ERROR_INVALID_RESOURCE_HANDLE = 0x3,         // invalid handle
+  RT_ERROR_INVALID_DEVICE_POINTER = 0x4,          // invalid device pointer
+  RT_ERROR_INVALID_MEMCPY_DIRECTION = 0x5,        // invalid memory copy direction
+  RT_ERROR_INVALID_DEVICE = 0x6,                  // invalid device
+  RT_ERROR_NO_DEVICE = 0x7,                       // no valid device
+  RT_ERROR_CMD_OCCUPY_FAILURE = 0x8,              // command occupy failure
+  RT_ERROR_SET_SIGNAL_FAILURE = 0x9,              // set signal failure
+  RT_ERROR_UNSET_SIGNAL_FAILURE = 0xA,            // unset signal failure
+  RT_ERROR_OPEN_FILE_FAILURE = 0xB,               // open file failure
+  RT_ERROR_WRITE_FILE_FAILURE = 0xC,
+  RT_ERROR_MEMORY_ADDRESS_UNALIGNED = 0xD,
+  RT_ERROR_DRV_ERR = 0xE,
+  RT_ERROR_LOST_HEARTBEAT = 0xF,
+  RT_ERROR_REPORT_TIMEOUT = 0x10,
+  RT_ERROR_NOT_READY = 0x11,
+  RT_ERROR_DATA_OPERATION_FAIL = 0x12,
+  RT_ERROR_INVALID_L2_INSTR_SIZE = 0x13,
+  RT_ERROR_DEVICE_PROC_HANG_OUT = 0x14,
+  RT_ERROR_DEVICE_POWER_UP_FAIL = 0x15,
+  RT_ERROR_DEVICE_POWER_DOWN_FAIL = 0x16,
+  RT_ERROR_FEATURE_NOT_SUPPROT = 0x17,
+  RT_ERROR_KERNEL_DUPLICATE = 0x18,               // register same kernel repeatedly
+  RT_ERROR_MODEL_STREAM_EXE_FAILED = 0x91,        // the model stream failed
+  RT_ERROR_MODEL_LOAD_FAILED = 0x94,              // the model load failed
+  RT_ERROR_END_OF_SEQUENCE = 0x95,                // end of sequence
+  RT_ERROR_NO_STREAM_CB_REG = 0x96,               // no callback register info for stream
+  RT_ERROR_DATA_DUMP_LOAD_FAILED = 0x97,          // data dump load info fail
+  RT_ERROR_CALLBACK_THREAD_UNSUBSTRIBE = 0x98,    // callback thread unsubscribe
+  RT_ERROR_RESERVED
} rtError_t;

/**
diff --git a/third_party/fwkacllib/inc/runtime/config.h b/third_party/fwkacllib/inc/runtime/config.h
index c7301a99..e5d5d360 100644
--- a/third_party/fwkacllib/inc/runtime/config.h
+++ b/third_party/fwkacllib/inc/runtime/config.h
@@ -21,7 +21,7 @@
 
 #ifdef __cplusplus
 extern "C" {
-#endif  // __cplusplus
+#endif
 
 #define 
PLAT_COMBINE(arch, chip, ver) ((arch << 16) | (chip << 8) | (ver)) #define PLAT_GET_ARCH(type) ((type >> 16) & 0xffff) @@ -52,7 +52,7 @@ typedef enum tagRtVersion { VER_END, } rtVersion_t; -// match rtChipType_t +/* match rtChipType_t */ typedef enum tagRtPlatformType { PLATFORM_BEGIN = 0, PLATFORM_MINI_V1 = PLATFORM_BEGIN, diff --git a/third_party/fwkacllib/inc/runtime/context.h b/third_party/fwkacllib/inc/runtime/context.h index 016abec1..54621e86 100644 --- a/third_party/fwkacllib/inc/runtime/context.h +++ b/third_party/fwkacllib/inc/runtime/context.h @@ -21,7 +21,7 @@ #ifdef __cplusplus extern "C" { -#endif // __cplusplus +#endif /** * @ingroup rt_context @@ -29,7 +29,10 @@ extern "C" { */ typedef void *rtContext_t; -typedef enum tagDryRunFlag { RT_DRYRUN_FLAG_FALSE = 0, RT_DRYRUN_FLAG_TRUE = 1 } rtDryRunFlag_t; +typedef enum tagDryRunFlag { + RT_DRYRUN_FLAG_FALSE = 0, + RT_DRYRUN_FLAG_TRUE = 1, +} rtDryRunFlag_t; typedef enum tagCtxMode { RT_CTX_NORMAL_MODE = 0, @@ -64,6 +67,14 @@ RTS_API rtError_t rtCtxCreateEx(rtContext_t *ctx, uint32_t flags, int32_t device */ RTS_API rtError_t rtCtxDestroy(rtContext_t ctx); +/** + * @ingroup rt_context + * @brief destroy context instance + * @param [in] ctx context to destroy + * @return RT_ERROR_NONE for ok + */ +RTS_API rtError_t rtCtxDestroyEx(rtContext_t ctx); + /** * @ingroup rt_context * @brief binds context to the calling CPU thread. diff --git a/third_party/fwkacllib/inc/runtime/dev.h b/third_party/fwkacllib/inc/runtime/dev.h index b171ff73..6f5ff62b 100644 --- a/third_party/fwkacllib/inc/runtime/dev.h +++ b/third_party/fwkacllib/inc/runtime/dev.h @@ -21,10 +21,10 @@ #ifdef __cplusplus extern "C" { -#endif // __cplusplus +#endif typedef struct tagRTDeviceInfo { - uint8_t env_type; /* 0: FPGA 1: EMU 2: ESL */ + uint8_t env_type; // 0: FPGA 1: EMU 2: ESL uint32_t ctrl_cpu_ip; uint32_t ctrl_cpu_id; uint32_t ctrl_cpu_core_num; @@ -41,6 +41,13 @@ typedef struct tagRTDeviceInfo { #endif } rtDeviceInfo_t; +typedef enum tagRtRunMode { + RT_RUN_MODE_OFFLINE = 0, + RT_RUN_MODE_ONLINE = 1, + RT_RUN_MODE_AICPU_SCHED = 2, + RT_RUN_MODE_RESERVED +} rtRunMode; + /** * @ingroup dvrt_dev * @brief get total device number. 
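*
* A usage sketch for the rtGetRunMode interface added in the next hunk,
* assuming an initialised runtime (illustrative, error handling elided):
*     rtRunMode mode = RT_RUN_MODE_RESERVED;
*     if (rtGetRunMode(&mode) == RT_ERROR_NONE && mode == RT_RUN_MODE_AICPU_SCHED) {
*         // take the AICPU-scheduled execution path
*     }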
@@ -198,6 +205,14 @@ RTS_API rtError_t rtSetExceptCallback(rtErrorCallback callback); */ RTS_API rtError_t rtSetTSDevice(uint32_t tsId); +/** + * @ingroup dvrt_dev + * @brief init aicpu executor + * @param [out] runtime run mode + * @return RT_ERROR_NONE for ok + * @return RT_ERROR_DRV_ERR for can not get run mode + */ +RTS_API rtError_t rtGetRunMode(rtRunMode *mode); #ifdef __cplusplus } #endif diff --git a/third_party/fwkacllib/inc/runtime/dvfsprofile.h b/third_party/fwkacllib/inc/runtime/dvfsprofile.h index c896a31e..11081546 100644 --- a/third_party/fwkacllib/inc/runtime/dvfsprofile.h +++ b/third_party/fwkacllib/inc/runtime/dvfsprofile.h @@ -21,7 +21,7 @@ #ifdef __cplusplus extern "C" { -#endif // __cplusplus +#endif typedef enum dvfsProfileMode { DVFS_PROFILE_PERFORMANCE_PRIORITY, diff --git a/third_party/fwkacllib/inc/runtime/event.h b/third_party/fwkacllib/inc/runtime/event.h index 07201762..fbc4d759 100644 --- a/third_party/fwkacllib/inc/runtime/event.h +++ b/third_party/fwkacllib/inc/runtime/event.h @@ -21,7 +21,7 @@ #ifdef __cplusplus extern "C" { -#endif // __cplusplus +#endif /** * @ingroup dvrt_event diff --git a/third_party/fwkacllib/inc/runtime/kernel.h b/third_party/fwkacllib/inc/runtime/kernel.h index a2c75bd3..1609519f 100644 --- a/third_party/fwkacllib/inc/runtime/kernel.h +++ b/third_party/fwkacllib/inc/runtime/kernel.h @@ -22,22 +22,22 @@ #ifdef __cplusplus extern "C" { -#endif // __cplusplus +#endif /** * @ingroup rt_kernel * @brief shared memory data control */ typedef struct tagRtSmData { - uint64_t L2_mirror_addr; // preload or swap source address - uint32_t L2_data_section_size; // every data size - uint8_t L2_preload; // 1 - preload from mirrorAddr, 0 - no preload - uint8_t modified; // 1 - data will be modified by kernel, 0 - no modified - uint8_t priority; // data priority - int8_t prev_L2_page_offset_base; // remap source section offset - uint8_t L2_page_offset_base; // remap destination section offset - uint8_t L2_load_to_ddr; // 1 - need load out, 0 - no need - uint8_t reserved[2]; // reserved + uint64_t L2_mirror_addr; // preload or swap source address + uint32_t L2_data_section_size; // every data size + uint8_t L2_preload; // 1 - preload from mirrorAddr, 0 - no preload + uint8_t modified; // 1 - data will be modified by kernel, 0 - no modified + uint8_t priority; // data priority + int8_t prev_L2_page_offset_base; // remap source section offset + uint8_t L2_page_offset_base; // remap destination section offset + uint8_t L2_load_to_ddr; // 1 - need load out, 0 - no need + uint8_t reserved[2]; // reserved } rtSmData_t; /** @@ -48,8 +48,8 @@ typedef struct tagRtSmCtrl { rtSmData_t data[8]; // data description uint64_t size; // max page Num uint8_t remap[64]; /* just using for static remap mode, default:0xFF - * array index: virtual l2 page id, array value: physic l2 page id */ - uint8_t l2_in_main; // 0-DDR, 1-L2, default:0xF + array index: virtual l2 page id, array value: physic l2 page id */ + uint8_t l2_in_main; // 0-DDR, 1-L2, default:0xFF uint8_t reserved[3]; } rtSmDesc_t; @@ -60,12 +60,30 @@ typedef rtSmDesc_t rtL2Ctrl_t; * @brief device binary type */ typedef struct tagRtDevBinary { - uint32_t magic; /**< magic number */ - uint32_t version; /**< version of binary */ - const void *data; /**< binary data */ - uint64_t length; /**< binary length */ + uint32_t magic; // magic number + uint32_t version; // version of binary + const void *data; // binary data + uint64_t length; // binary length } rtDevBinary_t; +/** + * @ingroup rt_kernel + * @brief 
online profiling data
+ */
+#define ONLINE_PROF_MAX_PMU_NUM (8)
+
+typedef struct ProfDataInfo {
+  const void *stubFunc;
+  uint32_t blockDim;
+  const void *args;
+  uint32_t argsSize;
+  rtSmDesc_t *smDesc;
+  rtStream_t stream;
+  uint64_t totalcycle;
+  uint64_t ovcycle;
+  uint64_t pmu_cnt[ONLINE_PROF_MAX_PMU_NUM];
+} rtProfDataInfo_t;
+
 /**
  * @ingroup rt_kernel
  * @brief function mode type
@@ -109,6 +127,12 @@ typedef enum tagRtDumpKind {
  */
 typedef rtError_t (*rtKernelReportCallback)(rtStream_t stream, rtKernelInfo_t kernelInfo);
 
+/**
+ * @ingroup rt_kernel
+ * @brief stream report callback
+ */
+typedef void (*rtCallback_t)(void *fnData);
+
 /**
  * @ingroup rt_kernel
  * @brief magic number of plain binary for aicore
@@ -149,8 +173,8 @@ typedef rtError_t (*rtKernelReportCallback)(rtStream_t stream, rtKernelInfo_t ke
  * @ingroup rt_kernel_flags
  * @brief kernel op bit flags
  */
-#define RT_KERNEL_DEFAULT (0x00)
-#define RT_KERNEL_CONVERT (0x01)
+#define RT_KERNEL_DEFAULT  (0x00)
+#define RT_KERNEL_CONVERT  (0x01)
 #define RT_KERNEL_DUMPFLAG (0x02)
 
 /**
@@ -224,6 +248,14 @@ RTS_API rtError_t rtFunctionRegister(void *binHandle, const void *stubFunc, cons
 */
 RTS_API rtError_t rtGetFunctionByName(const char *stubName, void **stubFunc);
 
+/**
+ * @ingroup rt_kernel
+ * @brief find addr by stub func
+ * @param [in] stubFunc stub function
+ * @param [out] addr
+ * @return RT_ERROR_NONE for ok
+ */
+RTS_API rtError_t rtGetAddrByFun(const void *stubFunc, void **addr);
 /**
  * @ingroup rt_kernel
  * @brief query registered or not by stubName
@@ -283,7 +315,6 @@ RTS_API rtError_t rtKernelLaunchWithFlag(const void *stubFunc, uint32_t blockDim
 */
 RTS_API rtError_t rtKernelLaunchEx(void *args, uint32_t argsSize, uint32_t flags, rtStream_t stream);
-
 /**
  * @ingroup rt_kernel
  * @brief launch cpu kernel to device
@@ -312,9 +343,9 @@ RTS_API rtError_t rtCpuKernelLaunch(const void *soName, const void *kernelName,
  * @param [in] flag dump flag or others function flag
  * @retval RT_ERROR_NONE for ok, errno for failed
 */
-RTS_API rtError_t rtCpuKernelLaunchWithFlag(const void *soName, const void *kernelName, uint32_t blockDim,
-                                            const void *args, uint32_t argsSize, rtSmDesc_t *smDesc,
-                                            rtStream_t stream, uint32_t flags);
+RTS_API rtError_t rtCpuKernelLaunchWithFlag(const void *soName, const void *kernelName, uint32_t blockDim,
+                                            const void *args, uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stream,
+                                            uint32_t flags);
 
 /**
  * @ingroup rt_kernel
@@ -400,8 +431,62 @@ RTS_API rtError_t rtKernelFusionEnd(rtStream_t stream);
 */
 RTS_API rtError_t rtSetKernelReportCallback(rtKernelReportCallback callBack);
 
+/**
+ * @ingroup rt_kernel
+ * @brief subscribe stream callback report.
+ * @param [in] threadId thread id for stream
+ * @param [in] stream stream for subscribe
+ * @return RT_ERROR_NONE for ok, errno for failed
+ */
+RTS_API rtError_t rtSubscribeReport(uint64_t threadId, rtStream_t stream);
+
+/**
+ * @ingroup rt_kernel
+ * @brief add callback launch task in stream.
+ * @param [in] callBackFunc app callback function
+ * @param [in] fnData user data
+ * @param [in] stream subscribed stream
+ * @return RT_ERROR_NONE for ok, errno for failed
+ */
+RTS_API rtError_t rtCallbackLaunch(rtCallback_t callBackFunc, void *fnData, rtStream_t stream);
+
+/**
+ * @ingroup rt_kernel
+ * @brief process callback report.
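+ * A minimal subscription flow sketch (illustrative; "myCallback" and
+ * "userData" are hypothetical), assuming a created stream and the reporting
+ * thread's id:
+ *     rtSubscribeReport(threadId, stream);
+ *     rtCallbackLaunch(myCallback, userData, stream);
+ *     rtProcessReport(-1);  // -1 blocks until a report arrives
+ *     rtUnSubscribeReport(threadId, stream);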
+ * @param [in] timeout  -1 means wait indefinitely; otherwise wait the given timeout
+ * @return RT_ERROR_NONE for ok, errno for failed
+ */
+RTS_API rtError_t rtProcessReport(int32_t timeout);
+
+/**
+ * @ingroup rt_kernel
+ * @brief unsubscribe callback report.
+ * @param [in] threadId thread id for stream
+ * @param [in] stream stream for subscribe
+ * @return RT_ERROR_NONE for ok, errno for failed
+ */
+RTS_API rtError_t rtUnSubscribeReport(uint64_t threadId, rtStream_t stream);
+
+/**
+ * @ingroup profiling_base
+ * @brief start online prof.
+ */
+RTS_API rtError_t rtStartOnlineProf(rtStream_t stream, uint32_t sampleNum);
+
+/**
+ * @ingroup profiling_base
+ * @brief stop online prof.
+ */
+RTS_API rtError_t rtStopOnlineProf(rtStream_t stream);
+
+/**
+ * @ingroup profiling_base
+ * @brief get online prof.
+ */
+RTS_API rtError_t rtGetOnlineProfData(rtStream_t stream, rtProfDataInfo_t *pProfData, uint32_t profDataNum);
 #ifdef __cplusplus
 }
 #endif
 
 #endif  // __CCE_RUNTIME_KERNEL_H__
+
diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h
index ab740b11..b55530a1 100644
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -17,38 +17,40 @@
 #ifndef __CCE_RUNTIME_MEM_H__
 #define __CCE_RUNTIME_MEM_H__
 
+/*lint -e7*/
 #include
+/*lint +e7*/
 #include "base.h"
 #include "config.h"
 #include "stream.h"
 
 #ifdef __cplusplus
 extern "C" {
-#endif  // __cplusplus
+#endif
 
 /**
 * @ingroup dvrt_mem
 * @brief memory type
 */
-#define RT_MEMORY_DEFAULT ((uint32_t)0x0)  // default memory on device
-#define RT_MEMORY_HBM ((uint32_t)0x2)      // HBM memory on device
-#define RT_MEMORY_DDR ((uint32_t)0x4)      // DDR memory on device
-#define RT_MEMORY_SPM ((uint32_t)0x8)      // shared physical memory on device
+#define RT_MEMORY_DEFAULT ((uint32_t)0x0)   // default memory on device
+#define RT_MEMORY_HBM ((uint32_t)0x2)       // HBM memory on device
+#define RT_MEMORY_DDR ((uint32_t)0x4)       // DDR memory on device
+#define RT_MEMORY_SPM ((uint32_t)0x8)       // shared physical memory on device
 #define RT_MEMORY_P2P_HBM ((uint32_t)0x10)  // HBM memory on other 4P device
-#define RT_MEMORY_P2P_DDR ((uint32_t)0x11)  // DDR memory on other device>
-#define RT_MEMORY_DDR_NC ((uint32_t)0x20)   // DDR memory of non-cache>
+#define RT_MEMORY_P2P_DDR ((uint32_t)0x11)  // DDR memory on other device
+#define RT_MEMORY_DDR_NC ((uint32_t)0x20)   // DDR memory of non-cache
 #define RT_MEMORY_RESERVED ((uint32_t)0x40)
 
 /**
 * @ingroup dvrt_mem
 * @brief memory Policy
 */
-#define RT_MEMORY_POLICY_NONE ((uint32_t)0x0)  // Malloc mem prior hage page, then default page
+#define RT_MEMORY_POLICY_NONE ((uint32_t)0x0)  // Malloc mem prior huge page, then default page
 #define RT_MEMORY_POLICY_HUGE_PAGE_FIRST ((uint32_t)0x1 << 10)  // Malloc mem prior hage page, then default page
 #define RT_MEMORY_POLICY_HUGE_PAGE_ONLY ((uint32_t)0x1 << 11)   // Malloc mem only use hage page
 #define RT_MEMORY_POLICY_DEFAULT_PAGE_ONLY ((uint32_t)0x1 << 12)  // Malloc mem only use default page
 
-#define MEM_ALLOC_TYPE_BIT ((uint32_t)0x3FF)  // mem type bit in <0, 9>
+#define MEM_ALLOC_TYPE_BIT ((uint32_t)0x3FF)   // mem type bit in <0, 9>
 
 /**
 * @ingroup dvrt_mem
@@ -95,7 +97,7 @@ typedef enum tagRtDataType {
 typedef enum tagRtMemcpyChannelType {
   RT_MEMCPY_CHANNEL_TYPE_INNER = 0,  // 1P
   RT_MEMCPY_CHANNEL_TYPE_PCIe,
-  RT_MEMCPY_CHANNEL_TYPE_HCCs,  // not support now
+  RT_MEMCPY_CHANNEL_TYPE_HCCs,       // not support now
   RT_MEMCPY_CHANNEL_TYPE_RESERVED,
 } rtMemcpyChannelType_t;
 
@@ -442,6 +444,18 @@ RTS_API rtError_t rtMemSetRC(const void *devPtr, uint64_t size, 
diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h
index ab740b11..b55530a1 100644
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -17,38 +17,40 @@
 #ifndef __CCE_RUNTIME_MEM_H__
 #define __CCE_RUNTIME_MEM_H__

+/*lint -e7*/
 #include
+/*lint +e7*/
 #include "base.h"
 #include "config.h"
 #include "stream.h"

 #ifdef __cplusplus
 extern "C" {
-#endif  // __cplusplus
+#endif

 /**
  * @ingroup dvrt_mem
  * @brief memory type
  */
-#define RT_MEMORY_DEFAULT ((uint32_t)0x0)  // default memory on device
-#define RT_MEMORY_HBM ((uint32_t)0x2)      // HBM memory on device
-#define RT_MEMORY_DDR ((uint32_t)0x4)      // DDR memory on device
-#define RT_MEMORY_SPM ((uint32_t)0x8)      // shared physical memory on device
+#define RT_MEMORY_DEFAULT ((uint32_t)0x0)   // default memory on device
+#define RT_MEMORY_HBM ((uint32_t)0x2)       // HBM memory on device
+#define RT_MEMORY_DDR ((uint32_t)0x4)       // DDR memory on device
+#define RT_MEMORY_SPM ((uint32_t)0x8)       // shared physical memory on device
 #define RT_MEMORY_P2P_HBM ((uint32_t)0x10)  // HBM memory on other 4P device
-#define RT_MEMORY_P2P_DDR ((uint32_t)0x11)  // DDR memory on other device>
-#define RT_MEMORY_DDR_NC ((uint32_t)0x20)   // DDR memory of non-cache>
+#define RT_MEMORY_P2P_DDR ((uint32_t)0x11)  // DDR memory on other device
+#define RT_MEMORY_DDR_NC ((uint32_t)0x20)   // non-cacheable DDR memory
 #define RT_MEMORY_RESERVED ((uint32_t)0x40)

 /**
  * @ingroup dvrt_mem
  * @brief memory Policy
  */
-#define RT_MEMORY_POLICY_NONE ((uint32_t)0x0)  // Malloc mem prior hage page, then default page
+#define RT_MEMORY_POLICY_NONE ((uint32_t)0x0)                     // Malloc mem prefers huge page, then default page
 #define RT_MEMORY_POLICY_HUGE_PAGE_FIRST ((uint32_t)0x1 << 10)    // Malloc mem prefers huge page, then default page
 #define RT_MEMORY_POLICY_HUGE_PAGE_ONLY ((uint32_t)0x1 << 11)     // Malloc mem uses huge page only
 #define RT_MEMORY_POLICY_DEFAULT_PAGE_ONLY ((uint32_t)0x1 << 12)  // Malloc mem uses default page only

-#define MEM_ALLOC_TYPE_BIT ((uint32_t)0x3FF)  // mem type bit in <0, 9>
+#define MEM_ALLOC_TYPE_BIT ((uint32_t)0x3FF)  // mem type bits in <0, 9>

 /**
  * @ingroup dvrt_mem
@@ -95,7 +97,7 @@ typedef enum tagRtDataType {
 typedef enum tagRtMemcpyChannelType {
   RT_MEMCPY_CHANNEL_TYPE_INNER = 0,  // 1P
   RT_MEMCPY_CHANNEL_TYPE_PCIe,
-  RT_MEMCPY_CHANNEL_TYPE_HCCs,  // not support now
+  RT_MEMCPY_CHANNEL_TYPE_HCCs,  // not supported yet
   RT_MEMCPY_CHANNEL_TYPE_RESERVED,
 } rtMemcpyChannelType_t;

@@ -442,6 +444,18 @@ RTS_API rtError_t rtMemSetRC(const void *devPtr, uint64_t size, uint32_t readCou
 */
 RTS_API rtError_t rtSetIpcMemPid(const char *name, int32_t pid[], int num);

+/**
+ * @ingroup dvrt_mem
+ * @brief HCCL async memory copy (RDMA doorbell send)
+ * @param [in] dbIndex doorbell index, 0 for a single device
+ * @param [in] dbInfo doorbell info
+ * @param [in] stream stream of the asynchronous task
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for invalid input
+ * @return RT_ERROR_DRV_ERR for driver error
+ */
+RTS_API rtError_t rtRDMADBSend(uint32_t dbIndex, uint64_t dbInfo, rtStream_t stream);
+
 #ifdef __cplusplus
 }
 #endif
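A minimal sketch (not part of the patch) of how the type and policy words above combine. It assumes rtMalloc(void **devPtr, uint64_t size, uint32_t type) is the allocation entry point declared elsewhere in mem.h; only the flag macros come from this hunk.

#include <stdint.h>
#include "runtime/mem.h"

int alloc_hbm_prefer_huge(void **devPtr, uint64_t size) {
  /* Type bits occupy <0, 9>; policy bits start at bit 10, so they OR together. */
  uint32_t flags = RT_MEMORY_HBM | RT_MEMORY_POLICY_HUGE_PAGE_FIRST;
  if (rtMalloc(devPtr, size, flags) != RT_ERROR_NONE) {  /* assumed signature */
    return -1;
  }
  /* MEM_ALLOC_TYPE_BIT recovers the plain memory type from a combined word. */
  uint32_t memType = flags & MEM_ALLOC_TYPE_BIT;  /* == RT_MEMORY_HBM */
  (void)memType;
  return 0;
}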
diff --git a/third_party/fwkacllib/inc/runtime/rt_model.h b/third_party/fwkacllib/inc/runtime/rt_model.h
index c41a5a25..1e03e853 100644
--- a/third_party/fwkacllib/inc/runtime/rt_model.h
+++ b/third_party/fwkacllib/inc/runtime/rt_model.h
@@ -21,7 +21,7 @@

 #ifdef __cplusplus
 extern "C" {
-#endif  // __cplusplus
+#endif

 typedef enum tagModelTaskType {
   RT_MODEL_TASK_KERNEL = 0,
@@ -44,14 +44,20 @@ typedef enum tagModelTaskType {
   RT_MODEL_TASK_RDMA_SEND,
   RT_MODEL_TASK_EVENT_RESET = 18,
   RT_MODEL_TASK_MODEL_END_GRAPH,
-  RT_MODEL_TASK_STREAM_SWITCH_N
+  RT_MODEL_TASK_STREAM_SWITCH_N,
+  RT_MODEL_TASK_RDMA_DB_SEND
 } rtModelTaskType_t;

-typedef enum tagModelStreamType {
-  RT_MODEL_HEAD_STREAM = 0,
-  RT_MODEL_WAIT_ACTIVE_STREAM = 1
+typedef enum tagModelStreamType {
+  RT_MODEL_HEAD_STREAM = 0,
+  RT_MODEL_WAIT_ACTIVE_STREAM = 1
 } rtModelStreamType_t;

+typedef enum tagModelQueueFlag {
+  RT_MODEL_INPUT_QUEUE = 0,
+  RT_MODEL_OUTPUT_QUEUE = 1
+} rtModelQueueFlag_t;
+
 #define EXECUTOR_NONE ((uint32_t)0x0)
 #define EXECUTOR_TS ((uint32_t)0x01)
 #define EXECUTOR_AICPU ((uint32_t)0x02)
@@ -83,6 +89,11 @@ typedef struct tagModelStreamInfo {
   uint32_t streamFlag;
 } rtModelStreamInfo_t;

+typedef struct tagModelQueueInfo {
+  uint32_t queueID;
+  uint32_t flag;
+} rtModelQueueInfo_t;
+
 typedef struct tagAicpuModelInfo {
   uint32_t moduleID;
   uint32_t tsId;
@@ -90,6 +101,8 @@ typedef struct tagAicpuModelInfo {
   uint16_t aicpuTaskNum;
   uint64_t streamInfoPtr;
   uint64_t aicpuTaskPtr;
+  uint16_t queueSize;
+  uint64_t queueInfoPtr;
 } rtAicpuModelInfo_t;

 typedef struct tagKernelTaskInfo {
@@ -189,6 +202,12 @@ typedef struct tagrtRdmaSendTaskInfo {
   uint32_t reserved[8];
 } rtRdmaSendTaskInfo_t;

+typedef struct tagrtRdmaDbSendTaskInfo {
+  uint64_t dbInfo;
+  uint32_t dbIndex;
+  uint32_t reserved[7];  // offset 7
+} rtRdmaDbSendTaskInfo_t;
+
 typedef struct tagrtModelEndGraphTaskInfo {
   uint32_t modelId;
   uint32_t executorFlag;
@@ -212,6 +231,7 @@ typedef struct tagTaskInfo {
     rtNotifyTaskInfo_t notifyTask;
     rtReduceAsyncTaskInfo_t reduceAsyncTask;
     rtRdmaSendTaskInfo_t rdmaSendTask;
+    rtRdmaDbSendTaskInfo_t rdmaDbSendTask;
     rtModelEndGraphTaskInfo_t modelEndGraphTask;
     rtStreamSwitchNTaskInfo_t streamSwitchNTask;
     uint32_t reserved[10];
@@ -307,6 +327,17 @@ RTS_API rtError_t rtModelGetTaskId(rtModel_t model, uint32_t *taskid);
 */
 RTS_API rtError_t rtEndGraph(rtModel_t model, rtStream_t stream);

+/**
+ * @ingroup rt_model
+ * @brief add an end-graph task with flags to a stream
+ * @param [in] model model to execute
+ * @param [in] stream end graph stream
+ * @param [in] flags AICPU data dump flag
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for invalid input handle
+ */
+RTS_API rtError_t rtEndGraphEx(rtModel_t model, rtStream_t stream, uint32_t flags);
+
 /**
  * @ingroup rt_model
  * @brief add an end graph task to stream
@@ -326,6 +357,27 @@ RTS_API rtError_t rtModelExecutorSet(rtModel_t model, uint8_t flags);
 */
 RTS_API rtError_t rtModelAbort(rtModel_t model);

+/**
+ * @ingroup rt_model
+ * @brief bind a queue to the model
+ * @param [in] model model to bind
+ * @param [in] queueId queue id to bind
+ * @param [in] flag input or output queue flag
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for invalid input handle
+ */
+RTS_API rtError_t rtModelBindQueue(rtModel_t model, uint32_t queueId, rtModelQueueFlag_t flag);
+
+/**
+ * @ingroup rt_model
+ * @brief get model id
+ * @param [in] model model handle
+ * @param [out] modelId model id
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for invalid input handle
+ */
+RTS_API rtError_t rtModelGetId(rtModel_t model, uint32_t *modelId);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/third_party/fwkacllib/inc/runtime/stream.h b/third_party/fwkacllib/inc/runtime/stream.h
index 0b5ce843..83bb4b63 100644
--- a/third_party/fwkacllib/inc/runtime/stream.h
+++ b/third_party/fwkacllib/inc/runtime/stream.h
@@ -22,7 +22,7 @@

 #ifdef __cplusplus
 extern "C" {
-#endif  // __cplusplus
+#endif

 /**
  * @ingroup stream_flags
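A minimal sketch (not part of the patch) of the new queue-binding flow, using only the declarations added above: bind one input and one output queue to a loaded model, then read back its id. The queue ids are hypothetical values owned by the caller's queue manager.

#include <stdint.h>
#include "runtime/rt_model.h"

int bind_model_queues(rtModel_t model, uint32_t inQueueId, uint32_t outQueueId,
                      uint32_t *modelId) {
  if (rtModelBindQueue(model, inQueueId, RT_MODEL_INPUT_QUEUE) != RT_ERROR_NONE) {
    return -1;
  }
  if (rtModelBindQueue(model, outQueueId, RT_MODEL_OUTPUT_QUEUE) != RT_ERROR_NONE) {
    return -1;
  }
  /* rtModelGetId is handy for logging which model the queues were bound to. */
  return rtModelGetId(model, modelId) == RT_ERROR_NONE ? 0 : -1;
}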
diff --git a/third_party/fwkacllib/inc/tdt/tsd_client.h b/third_party/fwkacllib/inc/tdt/tsd_client.h
index a50f9c6b..a92939a3 100644
--- a/third_party/fwkacllib/inc/tdt/tsd_client.h
+++ b/third_party/fwkacllib/inc/tdt/tsd_client.h
@@ -27,6 +27,48 @@
 extern "C" {
 #endif  // __cplusplus

+/**
+* @ingroup Open
+* @brief Used by the Framework process to communicate with the TSDDaemon process
+* and notify TSD to finish initializing the other processes
+*
+* @par Function
+* Used by the Framework process to communicate with the TSDDaemon process
+* and notify TSD to finish initializing the other processes
+*
+* @param phyDeviceId [IN] type #unsigned int. Physical device ID
+* @param rankSize [IN] type #unsigned int. The rank size of the training.
+* The default value is 1. When rankSize is greater than 1,
+* HCCP is launched to perform the collective-communication operations.
+* @retval TDT_OK Success
+* @retval OtherValues Failure
+*
+* @par Dependency
+* @li libtsdclient.so: Library to which the interface belongs.
+* @li tsd_client.h: Header file where the interface declaration is located.
+* @li data_common.h: Header file where 'TDT_StatusT' is defined.
+*/
+TDT_StatusT TsdOpen(const uint32_t phyDeviceId, const uint32_t rankSize);
+
+/**
+* @ingroup Close
+* @brief notify TSDClient to release its resources
+*
+* @par Function
+* notify TSDClient to release its resources
+*
+* @param phyDeviceId [IN] type #unsigned int. Physical device ID
+* @retval TDT_OK Success
+* @retval OtherValues Failure
+*
+* @par Dependency
+* @li libtsdclient.so: Library to which the interface belongs.
+* @li tsd_client.h: Header file where the interface declaration is located.
+* @li data_common.h: Header file where 'TDT_StatusT' is defined.
+*/
+TDT_StatusT TsdClose(const uint32_t phyDeviceId);
+
+
 namespace tdt {
 /**
  * @ingroup RANK_SIZE_DEFAULT_VALUE.
diff --git a/third_party/fwkacllib/inc/toolchain/slog.h b/third_party/fwkacllib/inc/toolchain/slog.h
index 01636253..1fb9aff2 100644
--- a/third_party/fwkacllib/inc/toolchain/slog.h
+++ b/third_party/fwkacllib/inc/toolchain/slog.h
@@ -21,6 +21,14 @@
 extern "C" {
 #endif  // __cplusplus

+#ifndef LINUX
+#define LINUX 0
+#endif  // LINUX
+
+#ifndef OS_TYPE
+#define OS_TYPE 0
+#endif  // OS_TYPE
+
 /**
  * @ingroup slog
  *
@@ -109,7 +117,11 @@ enum {
   DVPP,        /**< DVPP */
   RUNTIME,     /**< Runtime */
   CCE,         /**< CCE */
-  HDC,         /**< HDC */
+#if (OS_TYPE == LINUX)
+  HDC,         /**< HDC */
+#else
+  HDCL,
+#endif  // OS_TYPE
   DRV,         /**< Driver */
   MDCFUSION,   /**< Mdc fusion */
   MDCLOCATION, /**< Mdc location */
@@ -150,8 +162,12 @@ enum {
   CAMERA,
   ASCENDCL,
   TEEOS,
+  ISP,
   SIS,
   HSM,
+  DSS,
+  PROCMGR,  // Process Manager, Base Platform
+  BBOX,
   INVLID_MOUDLE_ID
 };

@@ -174,7 +190,11 @@ static DCODE g_moduleIdName[] = {SET_MOUDLE_ID_MAP_NAME(SLOG),
   SET_MOUDLE_ID_MAP_NAME(DVPP),
   SET_MOUDLE_ID_MAP_NAME(RUNTIME),
   SET_MOUDLE_ID_MAP_NAME(CCE),
+#if (OS_TYPE == LINUX)
   SET_MOUDLE_ID_MAP_NAME(HDC),
+#else
+  SET_MOUDLE_ID_MAP_NAME(HDCL),
+#endif  // OS_TYPE
   SET_MOUDLE_ID_MAP_NAME(DRV),
   SET_MOUDLE_ID_MAP_NAME(MDCFUSION),
   SET_MOUDLE_ID_MAP_NAME(MDCLOCATION),
@@ -215,11 +235,16 @@ static DCODE g_moduleIdName[] = {SET_MOUDLE_ID_MAP_NAME(SLOG),
   SET_MOUDLE_ID_MAP_NAME(CAMERA),
   SET_MOUDLE_ID_MAP_NAME(ASCENDCL),
   SET_MOUDLE_ID_MAP_NAME(TEEOS),
+  SET_MOUDLE_ID_MAP_NAME(ISP),
   SET_MOUDLE_ID_MAP_NAME(SIS),
   SET_MOUDLE_ID_MAP_NAME(HSM),
-  {NULL, -1}};
-#endif  // SET_MOUDLE_ID_MAP_NAME
+  SET_MOUDLE_ID_MAP_NAME(DSS),
+  SET_MOUDLE_ID_MAP_NAME(PROCMGR),
+  SET_MOUDLE_ID_MAP_NAME(BBOX),
+  { NULL, -1 }};
+#endif  // MODULE_ID_NAME

+#if (OS_TYPE == LINUX)
 /**
  * @ingroup slog
  * @brief External log interface, which is called by other modules
 */
 extern void dlog_init(void);
@@ -236,6 +261,16 @@
 extern int dlog_getlevel(int moduleId, int *enableEvent);

+/**
+* @ingroup slog
+* @brief CheckLogLevel: check whether a log level is enabled for a module
+*
+* @param [in]moduleId: module id, eg: CCE
+* @param [in]logLevel: eg: DLOG_EVENT/DLOG_ERROR/DLOG_WARN/DLOG_INFO/DLOG_DEBUG
+* @return: 1: enabled, 0: disabled
+*/
+extern int CheckLogLevel(int moduleId, int logLevel);
+
 /**
  * @ingroup slog
  * @brief dlog_error: print error log
@@ -338,6 +373,7 @@
     DlogWithKVInner(moduleId, level, pstKVArray, kvNum, "[%s:%d]" fmt, __FILE__, __LINE__, ##__VA_ARGS__); \
   } while (0)

+
 /**
  * @ingroup slog
  * @brief Internal log interface, other modules are not allowed to call this interface
 */
 void DlogEventInner(int moduleId, const char *fmt, ...);
 void DlogInner(int moduleId, int level, const char *fmt, ...);
 void DlogWithKVInner(int moduleId, int level, KeyValue *pstKVArray, int kvNum, const char *fmt, ...);

+#else
+_declspec(dllexport) void dlog_init(void);
+_declspec(dllexport) int dlog_getlevel(int moduleId, int *enableEvent);
+#endif  // OS_TYPE
+
 #ifdef __cplusplus
 }
 #endif  // __cplusplus
diff --git a/third_party/fwkacllib/version.info b/third_party/fwkacllib/version.info
index f9a8f04c..0e65dd04 100644
--- a/third_party/fwkacllib/version.info
+++ b/third_party/fwkacllib/version.info
@@ -1 +1 @@
-Version=1.60.T0.0.B888
+Version=1.60.T49.0.B201
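A minimal sketch (not part of the patch) of the intended CheckLogLevel usage: gate expensive log-argument formatting before emitting a debug log. The dlog_debug macro is assumed to follow the dlog_error pattern documented earlier in slog.h; CheckLogLevel itself comes from this hunk.

#include "toolchain/slog.h"

static void log_shape(int n, int c, int h, int w) {
  if (CheckLogLevel(RUNTIME, DLOG_DEBUG) == 1) {  /* 1: enabled, 0: disabled */
    dlog_debug(RUNTIME, "tensor shape: [%d, %d, %d, %d]", n, c, h, w);
  }
}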
diff --git a/third_party/prebuild/x86_64/libslog.so b/third_party/prebuild/x86_64/libslog.so
index 23efeb3f5d58de362e3d560c6132c8f7861ae97e..b476618dc715be24fd75bd5c89a168b0dd9c60a1 100755
GIT binary patch
literal 89440
[89440-byte base85-encoded binary delta omitted]

--
Gitee