diff --git a/python/akg/composite/split_stitch.py b/python/akg/composite/split_stitch.py index 477b594274fa5757758f2b7f8df8a245ff7f1653..b69ffa1fe102471657535ed34a731c2907f002f7 100644 --- a/python/akg/composite/split_stitch.py +++ b/python/akg/composite/split_stitch.py @@ -104,7 +104,9 @@ def _shared_memory_optimization(desc_d, req_map, outputs): # rule1: one buffer start larger equal to the reused buffer end. if sort_req_liveness[sort_req_buf[i]].start >= sort_req_liveness[sort_req_buf[j]].end: # rule2: sizes are compatible. - if req_map[sort_req_buf[i]][0] <= req_map[sort_req_buf[j]][0] and sort_req_buf[j] not in outputs: + if sort_req_buf[i] in outputs or sort_req_buf[j] in outputs: + continue + if req_map[sort_req_buf[i]][0] <= req_map[sort_req_buf[j]][0]: # rule3: make sure the candidate reused buffer is not using by other conflict variable. for item in reverse_reuse_map.get(sort_req_buf[j], []): if (sort_req_liveness[item].end >= sort_req_liveness[sort_req_buf[i]].end) \ diff --git a/src/poly/create_cluster.cc b/src/poly/create_cluster.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f4b173938de4641e0b31165ff477cc037d6257b --- /dev/null +++ b/src/poly/create_cluster.cc @@ -0,0 +1,580 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "create_cluster.h"
+#include "poly/schedule_tree_util.h"
+#include "poly/scop.h"
+#include "poly/dma_inject.h"
+#include "poly/poly_util.h"
+#include 
+
+namespace akg {
+namespace ir {
+namespace poly {
+// Collect the names of every tensor id found in the statement read map and the statement write map.
+std::set CreateCluster::GetAllPromotedTensor() {
+  std::set all_tensors;
+  // The map is only read here, so take it by const reference instead of copying it on every call.
+  auto RecordPromotedTensor = [&all_tensors](const StmtIdHashMap &tensor_map) -> void {
+    for (const auto &item : tensor_map) {
+      for (const auto &item_id : item.second) {
+        all_tensors.emplace(item_id.get_name());
+      }
+    }
+  };
+
+  RecordPromotedTensor(scop_info_.StmtReadMap());
+  RecordPromotedTensor(scop_info_.StmtWriteMap());
+  return all_tensors;
+}
+
+// Return the names in all_tensors that do not belong to a defined tensor of the original binds.
+std::set CreateCluster::GetTempPromotedTensor(std::set all_tensors) {
+  const auto &origin_binds = scop_info_.user_config_.GetOriginBind();
+  std::set orig_tensors;
+
+  for (const auto &item : origin_binds) {
+    if (!item.first.defined()) continue;
+    auto id = isl::id(scop_info_.ctx_, item.first->op->name);
+    orig_tensors.insert(id.get_name());
+  }
+  // Both inputs are ordered sets, which is what std::set_difference requires.
+  std::set temp_tensors;
+  std::set_difference(all_tensors.begin(), all_tensors.end(), orig_tensors.begin(), orig_tensors.end(),
+                      std::inserter(temp_tensors, temp_tensors.begin()));
+  return temp_tensors;
+}
+
+// Give every tensor an initial PromotedTensorType and store it in all_tensors_.
+// The checks run in this order: user-configured (CUSTOM), listed as not-to-promote (NONE), temporary (TEMP);
+// anything left is OTHERS.
+void CreateCluster::RecordInitPromotedTensorType(const std::unordered_set &configed_tensors) {
+  std::set all_tensors = GetAllPromotedTensor();
+  std::set temp_tensors = GetTempPromotedTensor(all_tensors);
+  const auto &not_promoted_tensors = scop_info_.analysis_result_.GetTensorsNotPromote();
+
+  // According to the current judgment, initialize the promoted type of all tensor.
+  for (const auto &tensor : all_tensors) {
+    auto id = isl::id(scop_info_.ctx_, tensor);
+    if (configed_tensors.find(tensor) != configed_tensors.end()) {
+      all_tensors_[id] = PromotedTensorType::CUSTOM;
+    } else if (not_promoted_tensors.find(tensor) != not_promoted_tensors.end()) {
+      all_tensors_[id] = PromotedTensorType::NONE;
+    } else if (temp_tensors.find(tensor) != temp_tensors.end()) {
+      all_tensors_[id] = PromotedTensorType::TEMP;
+    } else {
+      all_tensors_[id] = PromotedTensorType::OTHERS;
+    }
+  }
+}
+
+// Return the tensors of all_tensors whose type is not NONE, ordered by descending promoted type.
+// Tensors of the same type are ordered by ascending name, which keeps the result deterministic.
+std::vector> CreateCluster::SortPromotedTensorInfo(
+  const PromotedTensor &all_tensors) {
+  // Sort the tensor according to the promoted type
+  auto Compute = [](const std::pair &a, const std::pair &b) -> bool {
+    if (a.second == b.second) {
+      return a.first.get_name() < b.first.get_name();
+    }
+    return a.second > b.second;
+  };
+
+  std::vector> tensor_list;
+  for (const auto &entry : all_tensors) {
+    // If the current operator does not need to be promoted, it does not need to be sorted.
+    if (entry.second == PromotedTensorType::NONE) {
+      continue;
+    }
+    tensor_list.push_back(std::pair(entry.first, entry.second));
+  }
+
+  std::sort(tensor_list.begin(), tensor_list.end(), Compute);
+  return tensor_list;
+}
+
+// Record the final tensor that needs to be promoted.
+// For every `mark_name` mark node found under orig_node, try each candidate tensor in priority order and append a
+// BufferDefInfo to buffer_def_infos_ for those that have a footprint cluster and pass CheckPromotion.
+void CreateCluster::RecordPromotedTensorInfo(const isl::schedule_node &orig_node, const std::string &mark_name,
+                                             const PromotedTensor &all_tensors) {
+  // Nothing to record: return before sorting or querying the access relations.
+  if (all_tensors.empty()) {
+    return;
+  }
+  auto all_tensors_list = SortPromotedTensorInfo(all_tensors);
+
+  isl::union_map reads = scop_info_.analysis_result_.GetReads();
+  isl::union_map writes = scop_info_.analysis_result_.GetWrites();
+  isl::union_map copyin = scop_info_.analysis_result_.GetCopyin();
+  isl::union_map fake_copyin = scop_info_.analysis_result_.GetFakeCopyin();
+
+  std::vector nodes = CollectMarkNode(orig_node, mark_name);
+
+  for (const auto &node : nodes) {
+    // The partial schedule is taken at the parent of the mark node.
+    auto tree = node.parent();
+    auto partial_sched = GetPartialSchedule(tree);
+
+    for (const auto &tensor : all_tensors_list) {
+      auto promoted_id = tensor.first;
+      BufferDefInfo promoted_info = GetPromotedInfo(promoted_id, mark_name);
+
+      promoted_info.footprints_cluster = TensorFootprintCluster::HoistBufferFootprintCluster(
+        partial_sched, promoted_id, reads, copyin, writes, fake_copyin);
+      if (promoted_info.footprints_cluster == nullptr ||
+          !CheckPromotion(tree, orig_node, *promoted_info.footprints_cluster, tensor)) {
+        continue;
+      }
+
+      promoted_info.footprint_cluster_map.emplace_back(std::make_pair(tree, promoted_info.footprints_cluster));
+      scop_info_.analysis_result_.buffer_def_infos_.push_back(promoted_info);
+    }
+  }
+}
+
+// Raise matrix A/B/C (when GetMatmulTensorsName reports them) to at least SPECIAL in all_tensors_.
+// A tensor that already has a higher type (NONE or CUSTOM) keeps it.
+void CreateCluster::RecordGemmTensors() {
+  auto tensors = GetMatmulTensorsName(scop_info_);
+  auto RecordPromotedTensor = [this, &tensors](const std::string &matrix_name) -> void {
+    if (tensors.count(matrix_name) == 0) {
+      return;
+    }
+    auto id = isl::id(scop_info_.ctx_, tensors[matrix_name]);
+    if (all_tensors_.count(id) == 0 || all_tensors_[id] < PromotedTensorType::SPECIAL) {
+      all_tensors_[id] = PromotedTensorType::SPECIAL;
+    }
+  };
+
+  RecordPromotedTensor(MATRIX_A);
+  RecordPromotedTensor(MATRIX_B);
+  RecordPromotedTensor(MATRIX_C);
+}
+
+// Select the entries of all_tensors_ that belong to the current marker:
+// tensors marked C when hoist_tensor_c is true, tensors marked A or B otherwise.
+PromotedTensor CreateCluster::GetCurrentMarkerTensors(const bool hoist_tensor_c) {
+  PromotedTensor current_tensors;
+  for (const auto &tensor : all_tensors_) {
+    auto id_name = tensor.first.get_name();
+    auto tensor_mark = GetTensorMark(id_name, scop_info_);
+    // Only promote tensor A/B at the position marked A/B.
+    bool is_tensor_ab = !hoist_tensor_c && (tensor_mark == TENSOR_A || tensor_mark == TENSOR_B);
+    // Only promote tensor C at the position marked C.
+    bool is_tensor_c = hoist_tensor_c && tensor_mark == TENSOR_C;
+    if (is_tensor_c || is_tensor_ab) {
+      current_tensors.insert(tensor);
+    }
+  }
+
+  return current_tensors;
+}
+
+/*********************************************
+ * Shared Create Cluster
+ *********************************************/
+// Walk every thread-mapped mark under `root` and return true as soon as one scheduled access of `cluster`
+// fails IsSubsetForIncreaseDim(access, tensor_dim - 1, inner_depth - 1); return false if none does.
+// The `node` parameter is not used.
+bool SharedCreateCluster::CoalescingAccessWay(const isl::schedule_node &node, const isl::schedule_node &root,
+                                              const TensorFootprintCluster &cluster) {
+  isl::union_map original = cluster.OrigianlAccessRelations();
+  size_t tensor_dim = cluster.foot_print_.GetBoxDim();
+  std::vector thread_marker = CollectFnNode(IsThreadMappedMark, root);
+  for (const auto &item : thread_marker) {
+    // Skip anything that is not "mark -> filter". The conditions must be joined with ||: with && the guard
+    // never fired for a mark node, and child(0) could be evaluated on a node that has no children.
+    if (!(item.isa()) || !(item.has_children()) ||
+        !(item.child(0).isa())) {
+      continue;
+    }
+    isl::schedule_node thread_filter = item.child(0);
+    if (!thread_filter.has_children()) {
+      continue;
+    }
+    isl::schedule_node thread_band = thread_filter.child(0);
+    if (!thread_band.has_children()) {
+      continue;
+    }
+    isl::schedule_node inner_band = thread_band.child(0);
+    // Depth added by thread_band; zero means no schedule dimension is contributed by it.
+    size_t num_mapped_thread = inner_band.schedule_depth() - thread_band.schedule_depth();
+    if (num_mapped_thread == 0) {
+      continue;
+    }
+    size_t inner_depth = inner_band.schedule_depth();
+    auto active_domains = CollectDomain(thread_band);
+    auto local_access = original.intersect_domain(active_domains);
+    auto schedule = ShortSchedule(inner_band);
+    auto schedule_access = local_access.apply_domain(schedule);
+    for (auto access : schedule_access.get_map_list()) {
+      if (!IsSubsetForIncreaseDim(access, tensor_dim - 1, inner_depth - 1)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Determine whether the current tensor needs to be promoted.
+// Types above TEMP are always promoted; otherwise the tensor is promoted when CoalescingAccessWay reports true
+// or when it is listed in the band's coalesced_access_tensors.
+bool SharedCreateCluster::CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                                         const TensorFootprintCluster &cluster,
+                                         const std::pair &tensor_info) {
+  if (tensor_info.second > PromotedTensorType::TEMP) {
+    return true;
+  }
+  // NOTE(review): this value is never read; kept only in case ShortScheduleMupa has effects not visible here.
+  auto partial_sched_mupa = ShortScheduleMupa(current_node.root(), current_node);
+  const auto &coalesced_access = scop_info_.analysis_result_.GetOuterBandNode(band_index_)->coalesced_access_tensors;
+  auto tensor_name = tensor_info.first.get_name();
+  return CoalescingAccessWay(current_node, node, cluster) ||
+         coalesced_access.find(tensor_name) != coalesced_access.end();
+}
+
+// Local schedule of `node`, restricted to the block mapping filter when that filter is not empty.
+// Aborts (CHECK) if `node` is below a thread mapping or if no block config is set.
+isl::union_map SharedCreateCluster::GetPartialSchedule(const isl::schedule_node &node) {
+  auto root_node = node.root();
+  CHECK(!IsAncestorMapToThread(node)) << "shared memory promotion cannot below thread_marker.";
+  auto block_cfg = scop_info_.user_config_.GetBlockConfig();
+  CHECK(block_cfg != nullptr) << "block config is null";
+  auto replace_cfg = scop_info_.user_config_.GetReplaceConfig();
+  MappingStrategyAxisMap mapping_strategy = scop_info_.user_config_.GetOuterMappingStrategy(band_index_);
+  std::unordered_set non_repeated_idx = GetNonRepeatedIdx(mapping_strategy);
+  auto mapping_filter_info = GetMappingFilterInfo(root_node, block_cfg, replace_cfg, non_repeated_idx);
+
+  auto partial_sched = LocalSchedule(node);
+  if (!mapping_filter_info.is_empty()) {
+    partial_sched = partial_sched.intersect_domain(mapping_filter_info);
+  }
+  return partial_sched;
+}
+
+// Build the BufferDefInfo skeleton for a DDR -> SHARED_ promotion of `promoted_id` at marker `mark_name`.
+// The footprint cluster is left null; RecordPromotedTensorInfo fills it in.
+BufferDefInfo SharedCreateCluster::GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) {
+  GpuMemType gpu_mem_type = GpuMemType::SHARED;
+  MemType mem_type = MemType::SHARED_;
+
+  isl::id dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, promoted_id);
+  // Copy-in tensors of bands other than the first get a destination id that includes the band index.
+  if (scop_info_.IsCopyinTensor(promoted_id.get_name()) && band_index_ != 0) {
+    dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, promoted_id, band_index_);
+  }
+  std::vector buffer_sizes;
+  std::vector> data_stream;
+  data_stream.push_back(std::make_pair(promoted_id, MemType::DDR));
+  data_stream.push_back(std::make_pair(promoted_id, mem_type));
+  BufferDefInfo promoted_info = BufferDefInfo{promoted_id,
+                                              dst_tensor_id,
+                                              promoted_id,
+                                              MemType::DDR,
+                                              mark_name,
+                                              false,
+                                              false,
+                                              data_stream,
+                                              Tensor(),
+                                              Handle(),
+                                              buffer_sizes,
+                                              nullptr,
+                                              isl::union_map::empty(isl::space(scop_info_.ctx_, 0))};
+
+  return promoted_info;
+}
+
+void SharedCreateCluster::CreateClusterListForGemm(const isl::schedule_node &node,
+                                                   const std::unordered_set &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetSharedTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  // Modify promoted type of tensor A/B/C for gemm operator.
+  RecordGemmTensors();
+
+  for (const auto &mark_name : mark_names) {
+    bool hoist_tensor_c = mark_name == PROMOTE_GLOBAL_TO_SHARED_C;
+    // Promote the specific tensor at the corresponding marker position.
+    PromotedTensor current_tensors = GetCurrentMarkerTensors(hoist_tensor_c);
+    RecordPromotedTensorInfo(node, mark_name, current_tensors);
+  }
+}
+
+void SharedCreateCluster::CreateClusterListForElementWise(const isl::schedule_node &node,
+                                                          const std::unordered_set &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetSharedTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  for (const auto &mark_name : mark_names) {
+    RecordPromotedTensorInfo(node, mark_name, all_tensors_);
+  }
+}
+
+void SharedCreateCluster::CreateClusterListForReduce(const isl::schedule_node &node,
+                                                     const std::unordered_set &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetSharedTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  // Modify promoted type of the returned tensor for reduce operator.
+  RecordReduceTensors();
+
+  for (const auto &mark_name : mark_names) {
+    RecordPromotedTensorInfo(node, mark_name, all_tensors_);
+  }
+}
+
+// Adjust all_tensors_ for reduce operators: atomic tensors and reduce output tensors become at least SPECIAL,
+// and every tensor still typed OTHERS afterwards is switched to NONE.
+void SharedCreateCluster::RecordReduceTensors() {
+  // In order to enable cuda atomic operator, add these tensors for shared memory promotion list
+  auto atomic_tensors = scop_info_.analysis_result_.GetAtomicTensors();
+  for (const auto &item : atomic_tensors) {
+    auto id = isl::id(scop_info_.ctx_, item.tensor_name);
+    if (all_tensors_.count(id) == 0 || all_tensors_[id] < PromotedTensorType::SPECIAL) {
+      all_tensors_[id] = PromotedTensorType::SPECIAL;
+    }
+  }
+
+  // For the condition that it is without cuda atomic usage, but with reduce operation.
+  // Also need to add these tensors for shared memory promotion list.
+  auto reduce_out_tensors = scop_info_.analysis_result_.GetReduceTensorInfoMap();
+  for (const auto &item : reduce_out_tensors) {
+    auto id = isl::id(scop_info_.ctx_, item.second.write_tensor_name);
+    if (all_tensors_.count(id) == 0 || all_tensors_[id] < PromotedTensorType::SPECIAL) {
+      all_tensors_[id] = PromotedTensorType::SPECIAL;
+    }
+  }
+
+  // For the reduce operator, only the return tensor and the temp tensor can be promoted. For ordinary tensor, it will
+  // cause an error in the reduce interface after promotion.
+  for (auto &tensor : all_tensors_) {
+    if (tensor.second == PromotedTensorType::OTHERS) {
+      // Write through the iterated entry; a second lookup by key is not needed.
+      tensor.second = PromotedTensorType::NONE;
+    }
+  }
+}
+
+/*********************************************
+ * Register Create Cluster
+ *********************************************/
+// Local schedule of `node`, restricted to the block and/or thread mapping filters that are not empty
+// (their intersection when both are non-empty). Aborts (CHECK) if the block or thread config is missing.
+isl::union_map RegisterCreateCluster::GetPartialSchedule(const isl::schedule_node &node) {
+  auto root_node = node.root();
+  auto block_cfg = scop_info_.user_config_.GetBlockConfig();
+  CHECK(block_cfg != nullptr) << "block config is null";
+  auto replace_cfg = scop_info_.user_config_.GetReplaceConfig();
+  MappingStrategyAxisMap mapping_strategy = scop_info_.user_config_.GetOuterMappingStrategy(band_index_);
+  std::unordered_set non_repeated_idx = GetNonRepeatedIdx(mapping_strategy);
+  auto block_mapping = GetMappingFilterInfo(root_node, block_cfg, replace_cfg, non_repeated_idx);
+
+  auto thread_cfg = scop_info_.user_config_.GetThreadConfig();
+  CHECK(thread_cfg != nullptr) << "thread config is null";
+  auto thread_mapping = isl::union_set::empty(block_mapping.ctx());
+  mapping_strategy = scop_info_.user_config_.GetInnerMappingStrategy(band_index_);
+  non_repeated_idx = GetNonRepeatedIdx(mapping_strategy);
+  thread_mapping = GetMappingFilterInfo(root_node, thread_cfg, replace_cfg, non_repeated_idx);
+
+  auto partial_sched = LocalSchedule(node);
+  if (!thread_mapping.is_empty() && !block_mapping.is_empty()) {
+    auto mapping = block_mapping.intersect(thread_mapping);
+    partial_sched = partial_sched.intersect_domain(mapping);
+  } else if (!thread_mapping.is_empty()) {
+    partial_sched = partial_sched.intersect_domain(thread_mapping);
+  } else if (!block_mapping.is_empty()) {
+    partial_sched = partial_sched.intersect_domain(block_mapping);
+  }
+  return partial_sched;
+}
+
+// Check if the given "group" can be promoted to registers for the given mapping to thread identifiers and within the
+// given outer schedule.
+bool RegisterCreateCluster::IsPromote(const TensorFootprintCluster &fp_cluster,
+                                      const isl::multi_union_pw_aff &partial_sched_mupa,
+                                      const isl::multi_union_pw_aff &thread_schedule) {
+  // compute the mapping relation between single thread and outer schedule space and tensor elements pair
+  isl::union_map state_schedule_mapping =
+    ScheduleTensorMapping(partial_sched_mupa, fp_cluster.OrigianlAccessRelations());
+  isl::union_map thread_schedule_mapping = state_schedule_mapping.apply_domain(isl::union_map::from(thread_schedule));
+  // check that whether the mapping relation between single thread and outer schedule points and group elements pair
+  // is injective.
+  return thread_schedule_mapping.is_injective();
+}
+
+// Determine whether the current tensor needs to be promoted.
+// Types above OTHERS are always promoted; a tensor typed OTHERS is promoted only when IsPromote accepts the
+// cluster under the partial schedule extended with the block and thread schedules.
+bool RegisterCreateCluster::CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                                           const TensorFootprintCluster &cluster,
+                                           const std::pair &tensor_info) {
+  if (tensor_info.second > PromotedTensorType::OTHERS) {
+    return true;
+  }
+  isl::schedule_node root_node = current_node.get_schedule().get_root();
+  auto thread_cfg = scop_info_.user_config_.GetThreadConfig();
+  CHECK(thread_cfg != nullptr) << "thread config is null";
+  auto block_cfg = scop_info_.user_config_.GetBlockConfig();
+  CHECK(block_cfg != nullptr) << "block config is null";
+
+  auto thread_schedule = MapDomainAllWithType(root_node, thread_cfg, scop_info_.upa_node_mapping_, THREAD_MARKER);
+  auto block_schedule = MapDomainAllWithType(root_node, block_cfg, scop_info_.upa_node_mapping_, BLOCK_MARKER);
+  // When the isa() test below holds, the partial schedule is taken at the first child instead of current_node.
+  auto tmp_node = current_node;
+  if (current_node.isa()) {
+    tmp_node = current_node.child(0);
+  }
+
+  auto partial_sched_mupa = ShortScheduleMupa(root_node, tmp_node);
+  partial_sched_mupa = partial_sched_mupa.flat_range_product(block_schedule).flat_range_product(thread_schedule);
+  return IsPromote(cluster, partial_sched_mupa, thread_schedule);
+}
+
+// Build the BufferDefInfo skeleton for a promotion of `promoted_id` to LOCAL_ at marker `mark_name`.
+// The source is the shared copy (name + SHARE_SUFFIX, MemType::SHARED_) when that name is in shared_tensor_,
+// otherwise the tensor itself in MemType::DDR. The footprint cluster is left null for the caller to fill in.
+BufferDefInfo RegisterCreateCluster::GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) {
+  isl::id dst_tensor_id = GetGpuIndexDstId(GpuMemType::LOCAL, promoted_id);
+  // Copy-in tensors of bands other than the first get a destination id that includes the band index.
+  if (scop_info_.IsCopyinTensor(promoted_id.get_name()) && band_index_ != 0) {
+    dst_tensor_id = GetGpuIndexDstId(GpuMemType::LOCAL, promoted_id, band_index_);
+  }
+
+  std::vector buffer_sizes;
+  std::vector> data_stream;
+  MemType memtype;
+  isl::id tmp_item;
+  // Build the shared-memory name once; it is used for the lookup and for the source id.
+  const auto shared_name = promoted_id.get_name() + SHARE_SUFFIX;
+  if (!shared_tensor_.count(shared_name)) {
+    tmp_item = promoted_id;
+    data_stream.push_back(std::make_pair(promoted_id, MemType::DDR));
+    data_stream.push_back(std::make_pair(promoted_id, MemType::LOCAL_));
+    memtype = MemType::DDR;
+  } else {
+    tmp_item = isl::id(scop_info_.ctx_, shared_name);
+    data_stream.push_back(std::make_pair(promoted_id, MemType::SHARED_));
+    data_stream.push_back(std::make_pair(promoted_id, MemType::LOCAL_));
+    memtype = MemType::SHARED_;
+  }
+  BufferDefInfo promoted_info = BufferDefInfo{tmp_item,
+                                              dst_tensor_id,
+                                              tmp_item,
+                                              memtype,
+                                              mark_name,
+                                              false,
+                                              false,
+                                              data_stream,
+                                              Tensor(),
+                                              Handle(),
+                                              buffer_sizes,
+                                              nullptr,
+                                              isl::union_map::empty(isl::space(scop_info_.ctx_, 0))};
+
+  return promoted_info;
+}
+
+// Operators that have been promoted to the shared memory do not need to be promoted to the register memory in general.
+// Except for gemm operators.
+void RegisterCreateCluster::RecordSharedPromotedTensors(const bool is_gemm) {
+  // Remember the cluster id name of every active buffer footprint.
+  for (const auto &buffer : scop_info_.analysis_result_.active_buffer_footprints_) {
+    shared_tensor_.insert(buffer.second.cluster_id.get_name());
+  }
+
+  if (is_gemm) {
+    return;
+  }
+
+  std::string shared_suffix = SHARE_SUFFIX;
+  for (const auto &item : shared_tensor_) {
+    // Drop the trailing suffix to get the tensor name.
+    // NOTE(review): assumes every recorded name ends with SHARE_SUFFIX; confirm against the code that names clusters.
+    auto id = isl::id(scop_info_.ctx_, item.substr(0, item.length() - shared_suffix.size()));
+    if (all_tensors_.count(id) == 0 || all_tensors_[id] < PromotedTensorType::NONE) {
+      all_tensors_[id] = PromotedTensorType::NONE;
+    }
+  }
+}
+
+void RegisterCreateCluster::CreateClusterListForGemm(const isl::schedule_node &node,
+                                                     const std::unordered_set &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetRegisterTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  // Statistics shared_tensor_ information.
+  RecordSharedPromotedTensors(true);
+  // Modify promoted type of tensor A/B/C for gemm operator.
+  RecordGemmTensors();
+
+  for (const auto &mark_name : mark_names) {
+    bool hoist_tensor_c = ((mark_name == PROMOTE_GLOBAL_TO_REGISTER_C) || (mark_name == PROMOTE_SHARED_TO_REGISTER_C));
+    // Promote the specific tensor at the corresponding marker position.
+    PromotedTensor current_tensors = GetCurrentMarkerTensors(hoist_tensor_c);
+    RecordPromotedTensorInfo(node, mark_name, current_tensors);
+  }
+}
+
+void RegisterCreateCluster::CreateClusterListForElementWise(const isl::schedule_node &node,
+                                                            const std::unordered_set &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetRegisterTensors();
+  bool is_enable_vectorization = scop_info_.analysis_result_.GetOuterBandNode(band_index_)->enable_vectorization;
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  // Delete the tensor that has been promoted on shared memory.
+  RecordSharedPromotedTensors();
+  // Add the tensor that needs to be vectorized.
+  RecordVectorizedPromotedTensors(is_enable_vectorization);
+
+  for (const auto &mark_name : mark_names) {
+    RecordPromotedTensorInfo(node, mark_name, all_tensors_);
+  }
+}
+
+// Re-type every tensor whose type is not above OTHERS: SPECIAL when vectorization is enabled, NONE otherwise.
+void RegisterCreateCluster::RecordVectorizedPromotedTensors(const bool is_enable_vectorization) {
+  const PromotedTensorType new_type =
+    is_enable_vectorization ? PromotedTensorType::SPECIAL : PromotedTensorType::NONE;
+  // Iterate by reference and assign in place: no copy of each entry and no second hash lookup.
+  for (auto &tensor : all_tensors_) {
+    if (tensor.second > PromotedTensorType::OTHERS) {
+      continue;
+    }
+    tensor.second = new_type;
+  }
+}
+/*********************************************
+ * Cpu Create Cluster
+ *********************************************/
+// Build the BufferDefInfo skeleton for a DDR -> LOCAL_ promotion of `promoted_id` at marker `mark_name`.
+// The footprint cluster is left null for the caller to fill in.
+BufferDefInfo CpuCreateCluster::GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) {
+  GpuMemType gpu_mem_type = GpuMemType::LOCAL;
+  MemType mem_type = MemType::LOCAL_;
+
+  isl::id dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, promoted_id);
+  // Copy-in tensors of bands other than the first get a destination id that includes the band index.
+  if (scop_info_.IsCopyinTensor(promoted_id.get_name()) && band_index_ != 0) {
+    dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, promoted_id, band_index_);
+  }
+  std::vector buffer_sizes;
+  std::vector> data_stream;
+  data_stream.push_back(std::make_pair(promoted_id, MemType::DDR));
+  data_stream.push_back(std::make_pair(promoted_id, mem_type));
+  BufferDefInfo promoted_info = BufferDefInfo{promoted_id,
+                                              dst_tensor_id,
+                                              promoted_id,
+                                              MemType::DDR,
+                                              mark_name,
+                                              false,
+                                              false,
+                                              data_stream,
+                                              Tensor(),
+                                              Handle(),
+                                              buffer_sizes,
+                                              nullptr,
+                                              isl::union_map::empty(isl::space(scop_info_.ctx_, 0))};
+
+  return promoted_info;
+}
+
+// The CPU path uses the unrestricted local schedule of `node`.
+isl::union_map CpuCreateCluster::GetPartialSchedule(const isl::schedule_node &node) { return LocalSchedule(node); }
+
+// Determine whether the current tensor needs to be promoted.
+// The decision does not depend on the node, cluster or tensor arguments: promotion is allowed only when the
+// band's template type is MATMUL and matmul is enabled in the user config.
+bool CpuCreateCluster::CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                                      const TensorFootprintCluster &cluster,
+                                      const std::pair &tensor_info) {
+  auto template_type = scop_info_.analysis_result_.GetOuterBandNode(band_index_)->template_type;
+  return template_type == Template::MATMUL && scop_info_.user_config_.GetEnableMatmul();
+}
+
+void CpuCreateCluster::CreateClusterListForGemm(const isl::schedule_node &node,
+                                                const std::unordered_set &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetRegisterTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+
+  // Iterate by const reference so each marker name is not copied.
+  for (const auto &mark_name : mark_names) {
+    // Promote the specific tensor at the corresponding marker position.
+    // NOTE(review): the argument is always false, so the selection looks loop-invariant; it is left inside the
+    // loop because GetTensorMark's dependence on state changed by RecordPromotedTensorInfo is not visible here.
+    PromotedTensor current_tensors = GetCurrentMarkerTensors(false);
+    RecordPromotedTensorInfo(node, mark_name, current_tensors);
+  }
+}
+}  // namespace poly
+}  // namespace ir
+}  // namespace akg
diff --git a/src/poly/create_cluster.h b/src/poly/create_cluster.h
new file mode 100644
index 0000000000000000000000000000000000000000..607540a747b3670b1b110d37f7965badbe66feab
--- /dev/null
+++ b/src/poly/create_cluster.h
@@ -0,0 +1,141 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef POLY_CREATE_CLUSTER_H_
+#define POLY_CREATE_CLUSTER_H_
+
+#include "poly/schedule_pass.h"
+
+namespace akg {
+namespace ir {
+namespace poly {
+// tensor priority: custom > none > special > temp > others
+// OTHERS: whether the tensor is promoted or not has no effect on functionality, only on performance
+// TEMP: the temp tensor must be promoted
+// SPECIAL: reduce, gemm, conv, etc. operators must be promoted
+// NONE: the tensor does not need to be promoted
+// CUSTOM: the custom tensor must be promoted
+// The enumerator order is the priority order: the implementation compares these values with < and >.
+enum class PromotedTensorType { OTHERS = 0, TEMP, SPECIAL, NONE, CUSTOM };
+using PromotedTensor = std::unordered_map;
+
+// Base class of the promotion-candidate collectors. It classifies tensors into PromotedTensorType and records a
+// BufferDefInfo for each accepted tensor; the memory-specific decisions are the three pure virtual hooks below.
+class CreateCluster {
+ public:
+  explicit CreateCluster(ScopInfo &scop_info, int band_index) : scop_info_(scop_info), band_index_(band_index) {}
+  // Virtual because the class has virtual functions: destroying a derived object through a CreateCluster
+  // pointer must run the derived destructor.
+  virtual ~CreateCluster() {}
+
+ protected:
+  // Record the tensor that needs to be promoted.
+  std::set GetAllPromotedTensor();
+  std::set GetTempPromotedTensor(std::set all_tensors);
+  void RecordInitPromotedTensorType(const std::unordered_set &configed_tensors);
+
+  // Sort all tensors by their priority
+  std::vector> SortPromotedTensorInfo(const PromotedTensor &all_tensors);
+
+  // Record the final tensor that needs to be promoted.
+  void RecordPromotedTensorInfo(const isl::schedule_node &orig_node, const std::string &mark_name,
+                                const PromotedTensor &all_tensors);
+
+  // Common functions required by shared, register in gpu and cpu.
+  virtual isl::union_map GetPartialSchedule(const isl::schedule_node &node) = 0;
+  virtual BufferDefInfo GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) = 0;
+  virtual bool CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                              const TensorFootprintCluster &cluster,
+                              const std::pair &tensor_info) = 0;
+
+  // gemm operator
+  void RecordGemmTensors();
+  PromotedTensor GetCurrentMarkerTensors(const bool hoist_tensor_c);
+
+  ScopInfo &scop_info_;
+  // Promoted type of every known tensor, keyed by tensor id.
+  PromotedTensor all_tensors_;
+  int band_index_;
+};
+
+// Collector for promotion to GPU shared memory.
+class SharedCreateCluster : public CreateCluster {
+ public:
+  explicit SharedCreateCluster(ScopInfo &scop_info, int band_index) : CreateCluster(scop_info, band_index) {}
+  ~SharedCreateCluster() override {}
+
+  // Promoted tensors needed to create different types of operators.
+  void CreateClusterListForGemm(const isl::schedule_node &orig_node, const std::unordered_set &mark_names);
+  void CreateClusterListForReduce(const isl::schedule_node &orig_node,
+                                  const std::unordered_set &mark_names);
+  void CreateClusterListForElementWise(const isl::schedule_node &orig_node,
+                                       const std::unordered_set &mark_names);
+
+ private:
+  bool CoalescingAccessWay(const isl::schedule_node &node, const isl::schedule_node &root,
+                           const TensorFootprintCluster &cluster);
+
+  // Common functions required by shared, register in gpu and cpu.
+  bool CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                      const TensorFootprintCluster &cluster,
+                      const std::pair &tensor_info) override;
+  isl::union_map GetPartialSchedule(const isl::schedule_node &node) override;
+  BufferDefInfo GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) override;
+
+  void RecordReduceTensors();
+};
+
+// Collector for promotion to GPU registers (local memory).
+class RegisterCreateCluster : public CreateCluster {
+ public:
+  explicit RegisterCreateCluster(ScopInfo &scop_info, int band_index) : CreateCluster(scop_info, band_index) {}
+  ~RegisterCreateCluster() override {}
+
+  // Promoted tensors needed to create different types of operators.
+  void CreateClusterListForGemm(const isl::schedule_node &orig_node, const std::unordered_set &mark_names);
+  void CreateClusterListForElementWise(const isl::schedule_node &orig_node,
+                                       const std::unordered_set &mark_names);
+
+  isl::union_map GetPartialSchedule(const isl::schedule_node &node) override;
+
+ private:
+  void RecordSharedPromotedTensors(const bool is_gemm = false);
+  bool IsPromote(const TensorFootprintCluster &fp_cluster, const isl::multi_union_pw_aff &partial_sched_mupa,
+                 const isl::multi_union_pw_aff &thread_schedule);
+
+  // Common functions required by shared, register in gpu and cpu.
+  bool CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                      const TensorFootprintCluster &cluster,
+                      const std::pair &tensor_info) override;
+  BufferDefInfo GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) override;
+
+  void RecordVectorizedPromotedTensors(const bool is_enable_vectorization);
+
+  // Cluster id names of the active buffer footprints, filled by RecordSharedPromotedTensors.
+  std::set shared_tensor_;
+};
+
+// Collector for promotion on the CPU target.
+class CpuCreateCluster : public CreateCluster {
+ public:
+  explicit CpuCreateCluster(ScopInfo &scop_info, int band_index) : CreateCluster(scop_info, band_index) {}
+  ~CpuCreateCluster() override {}
+  // Promoted tensors needed to create different types of operators.
+  void CreateClusterListForGemm(const isl::schedule_node &orig_node, const std::unordered_set &mark_names);
+
+ private:
+  // Common functions required by shared, register in gpu and cpu.
+  isl::union_map GetPartialSchedule(const isl::schedule_node &node) override;
+  BufferDefInfo GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) override;
+  bool CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                      const TensorFootprintCluster &cluster,
+                      const std::pair &tensor_info) override;
+};
+}  // namespace poly
+}  // namespace ir
+}  // namespace akg
+
+#endif  // POLY_CREATE_CLUSTER_H_
\ No newline at end of file
diff --git a/src/poly/gpu_emit/gpu_isl_emitter.cc b/src/poly/gpu_emit/gpu_isl_emitter.cc index 93d2f718a611a52af198744024056d80492fc2ef..d637168b2ede9e9553b06a5d62d5b4702e212b8e 100644 --- a/src/poly/gpu_emit/gpu_isl_emitter.cc +++ b/src/poly/gpu_emit/gpu_isl_emitter.cc @@ -256,7 +256,6 @@ Stmt GpuIslEmitter::EmitTensorOfTensorStmt(const Stmt &s) { } void GpuIslEmitter::UpdateGpuIndexDtype() { - auto read_map = info_.StmtReadMap(); auto write_map = info_.StmtWriteMap(); std::set id_sets; @@ -289,12 +288,9 @@ void GpuIslEmitter::UpdateGpuIndexDtype() { } if (use_int64_idx_gpu) { - iter_name_map_ = {{B0, VarExpr(BLOCK_IDX_X, Int(64))}, - {B1, VarExpr(BLOCK_IDX_Y, Int(64))}, - {B2, VarExpr(BLOCK_IDX_Z, Int(64))}, - {T0, VarExpr(THREAD_IDX_X, Int(64))}, - {T1, VarExpr(THREAD_IDX_Y, Int(64))}, - {T2, VarExpr(THREAD_IDX_Z, Int(64))}}; + iter_name_map_ = {{B0, VarExpr(BLOCK_IDX_X, Int(64))}, {B1, VarExpr(BLOCK_IDX_Y, Int(64))}, + {B2, VarExpr(BLOCK_IDX_Z, Int(64))}, {T0, VarExpr(THREAD_IDX_X, Int(64))}, + {T1, VarExpr(THREAD_IDX_Y, Int(64))}, {T2, VarExpr(THREAD_IDX_Z, Int(64))}}; } } @@ -321,7 +317,6 @@ class InitStmtInsertSync : public IRMutator { }; Stmt GpuIslEmitter::Emit(const isl::ast_node &node) { - UpdateGpuIndexDtype(); Stmt stmt = EmitAst(node); @@ -404,20 +399,11 @@ Stmt
GpuIslEmitter::EmitRealizeForGlobalTensor(Stmt stmt) { Stmt GpuIslEmitter::EmitMark(const isl::ast_node_mark &node) { std::string mark = node.get_id().get_name(); - // add for prefetch pass - if (mark == PROMOTE_GLOBAL_TO_SHARED_AB) { - Stmt stmt = EmitAst(node.get_node()); - if (!stmt.defined()) { - return Stmt(); - } - return AttrStmt::make(Expr("INFO"), SHARED_MEM_PROMOTED_COMPLETE, StringImm::make(SHARED_MEM_PROMOTED_COMPLETE), - stmt); - } - Stmt stmt; if ((mark == PROMOTE_VECTORIZATION) || (mark == PROMOTE_REGISTER_TO_GLOBAL) || (mark == PROMOTE_REGISTER_TO_SHARED) || - (mark == PROMOTE_SHARED_TO_GLOBAL) || IsStartsWith(mark, REDUCE_ATOMIC_FLAG)) { + (mark == PROMOTE_SHARED_TO_GLOBAL) || (mark == SHARED_MEM_PROMOTED_COMPLETE) || + IsStartsWith(mark, REDUCE_ATOMIC_FLAG)) { stmt = EmitAst(node.get_node()); if (!stmt.defined()) { return Stmt(); diff --git a/src/poly/gpu_emit/gpu_isl_emitter.h b/src/poly/gpu_emit/gpu_isl_emitter.h index b621637f6b6efe355f2e804b88e421c9568435d5..ede956f152e6e7ce17a22ec0d23debb87d95945b 100644 --- a/src/poly/gpu_emit/gpu_isl_emitter.h +++ b/src/poly/gpu_emit/gpu_isl_emitter.h @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,7 +29,6 @@ constexpr auto MIND_TRICKS_SWIZZLE_PRAGMA = "pragma_swizzle_kernel"; // add for one dimension mapping constexpr auto ORIGIN_THREAD_DIM_X = "bind_thread_x"; -constexpr auto SHARED_MEM_PROMOTED_COMPLETE = "shared_mem_promoted_complete"; // example: // atomic_SumOp @@ -62,12 +61,9 @@ class GpuIslEmitter : public IslEmitter { virtual Stmt SubstituteTensorStmt(const Stmt &s, Tensor origin, Tensor replaced); virtual Stmt EmitTensorOfTensorStmt(const Stmt &s); void UpdateGpuIndexDtype(); - std::map iter_name_map_{{B0, VarExpr(BLOCK_IDX_X, Int(32))}, - {B1, VarExpr(BLOCK_IDX_Y, Int(32))}, - {B2, VarExpr(BLOCK_IDX_Z, Int(32))}, - {T0, VarExpr(THREAD_IDX_X, Int(32))}, - {T1, VarExpr(THREAD_IDX_Y, Int(32))}, - {T2, VarExpr(THREAD_IDX_Z, Int(32))}}; + std::map iter_name_map_{ + {B0, VarExpr(BLOCK_IDX_X, Int(32))}, {B1, VarExpr(BLOCK_IDX_Y, Int(32))}, {B2, VarExpr(BLOCK_IDX_Z, Int(32))}, + {T0, VarExpr(THREAD_IDX_X, Int(32))}, {T1, VarExpr(THREAD_IDX_Y, Int(32))}, {T2, VarExpr(THREAD_IDX_Z, Int(32))}}; private: // override emitters for GPU diff --git a/src/poly/poly_util.h b/src/poly/poly_util.h index 0734391e1d3022dcfb91917cdd910bc0df69085a..13be1ebba52902dbf2ab242e77f5e7baeeb18882 100644 --- a/src/poly/poly_util.h +++ b/src/poly/poly_util.h @@ -468,6 +468,7 @@ constexpr auto FOR_VECTORIZED = "for_vectorized"; constexpr auto FOR_UNROLLED = "for_unrolled"; constexpr auto FOR_SWIZZLED = "for_swizzled"; +constexpr auto SHARED_MEM_PROMOTED_COMPLETE = "shared_mem_promoted_complete"; constexpr auto PROMOTE_VECTORIZATION = "promote_vectorization"; constexpr auto PROMOTE_VECTORIZATION_BIT = 128; constexpr auto THREAD_MARKER = "thread_marker"; diff --git a/src/poly/schedule_analysis/band_node_analysis.cc b/src/poly/schedule_analysis/band_node_analysis.cc index a7a9b568637da29e07798dab692efeca56c3f3f1..e774e24b5185e3321f1bfd8e4e28ddc3685a9468 100644 --- a/src/poly/schedule_analysis/band_node_analysis.cc +++ b/src/poly/schedule_analysis/band_node_analysis.cc @@ -533,7 
+533,7 @@ class OperatorInfoCollector { if (tensor_c_type == Float(16) && enable_tensor_core) { std::string shared_tensors = tensor_a_name + " " + tensor_b_name + " " + tensor_c_name; - scop_info_.user_config_.SetSharedTensors(shared_tensors); + scop_info_.user_config_.RecordSharedTensors(shared_tensors); } return true; @@ -610,7 +610,7 @@ void AnalyzeBandNode::AnalyzeAxisPosition() { if (target_ == TARGET_CPU) { last_axis = GetVectorizationAxisForCpu(bn); } else { - last_axis = GetCoalescedAccessAxisForCuda(bn->node); + last_axis = GetCoalescedAccessAxisForCuda(bn); } bn->last_axis = last_axis; } @@ -638,7 +638,7 @@ int AnalyzeBandNode::GetVectorizationAxisForCpu(std::unique_ptr & } // For the tensor of tensor operator, confirm whether coalesced access is required in the calculation phase. -int AnalyzeBandNode::GetCoalescedAccessAxisForCuda(const isl::schedule_node &orig_node) { +int AnalyzeBandNode::GetCoalescedAccessAxisForCuda(std::unique_ptr &bn) { int coalesced_access_axis = -1; if (scop_info_.user_config_.GetEnableMatmul()) { return coalesced_access_axis; @@ -647,7 +647,8 @@ int AnalyzeBandNode::GetCoalescedAccessAxisForCuda(const isl::schedule_node &ori for (auto inner_tensor : scop_info_.analysis_result_.GetInnerTensor()) { skip_tensors.emplace(inner_tensor); } - coalesced_access_axis = GetLastAxisPos(orig_node, skip_tensors); + coalesced_access_axis = GetLastAxisPos(bn->node, skip_tensors); + RecordAllCoalescedAccessTensors(bn, skip_tensors); return coalesced_access_axis; } @@ -676,6 +677,26 @@ int AnalyzeBandNode::GetLastAxisPos(const isl::schedule_node &orig_node, std::un return -1; } +void AnalyzeBandNode::RecordAllCoalescedAccessTensors(std::unique_ptr &bn, + std::unordered_set skip_tensors) { + if (!bn->node.isa()) { + return; + } + + // Get read and write tensor information. 
+ auto reads_access = scop_info_.analysis_result_.GetReads().domain_factor_domain(); + reads_access.foreach_map([this, &bn, skip_tensors](const isl::map &map) -> void { + auto node = bn->node; + int band_number = static_cast(node.as().n_member()); + std::string id_name = map.range().get_tuple_name(); + auto access = isl::union_map(map); + int last_axis = GetLastAxis(node, access, skip_tensors); + if (last_axis != -1 && last_axis < band_number - 1) { + bn->coalesced_access_tensors.emplace(id_name); + } + }); +} + void AnalyzeBandNode::CollectStmtInfo() { auto prov_entry = scop_info_.analysis_result_.GetProvideAnalysis(); auto provides = scop_info_.analysis_result_.GetStatementMap(); diff --git a/src/poly/schedule_analysis/band_node_analysis.h b/src/poly/schedule_analysis/band_node_analysis.h index 92e28327a8c9bfedd9708cadb09372085013b899..629d7b4f0666bdce4db25f005a029b9b0bbe277a 100644 --- a/src/poly/schedule_analysis/band_node_analysis.h +++ b/src/poly/schedule_analysis/band_node_analysis.h @@ -42,8 +42,10 @@ class AnalyzeBandNode { void DetermineTemplateOfBand(std::unique_ptr &bn); bool IsGemmTempleteInBand(std::unique_ptr &bn); int GetVectorizationAxisForCpu(std::unique_ptr &bn); - int GetCoalescedAccessAxisForCuda(const isl::schedule_node &orig_node); + int GetCoalescedAccessAxisForCuda(std::unique_ptr &bn); int GetLastAxisPos(const isl::schedule_node &orig_node, std::unordered_set skip_tensors = {}); + void RecordAllCoalescedAccessTensors(std::unique_ptr &bn, + std::unordered_set skip_tensors = {}); std::string target_; const isl::schedule &sch_; diff --git a/src/poly/schedule_analysis/gpu_dma_analysis.h b/src/poly/schedule_analysis/gpu_dma_analysis.h index 66ca46053548ed357cb0d6dc22b36c49d1e14b39..000a8aabdbb0a6088034bcfb03e1895589983e70 100644 --- a/src/poly/schedule_analysis/gpu_dma_analysis.h +++ b/src/poly/schedule_analysis/gpu_dma_analysis.h @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei 
Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,9 +27,7 @@ using TensorSets = std::unordered_set; class GpuDmaAnalysis { public: explicit GpuDmaAnalysis(const isl::schedule &sch, ScopInfo &scop_info) : sch_(sch), scop_info_(scop_info) { - if (!scop_info.user_config_.GetSharedTensors().empty()) { - configed_share_tensors_ = Split(scop_info.user_config_.GetSharedTensors(), " "); - } + configed_share_tensors_ = scop_info.user_config_.GetSharedTensors(); }; ~GpuDmaAnalysis() {} @@ -48,7 +46,7 @@ class GpuDmaAnalysis { private: const isl::schedule &sch_; ScopInfo &scop_info_; - std::vector configed_share_tensors_; + std::unordered_set configed_share_tensors_; const int MAX_STRIDE = 65535; }; diff --git a/src/poly/schedule_pass/tile_outer_band.cc b/src/poly/schedule_pass/tile_outer_band.cc index e25016d72c41f65ea6fbc511efd25aef30a17812..062c0a94d423b85d053bb90e9e1d26e6941e6bd1 100644 --- a/src/poly/schedule_pass/tile_outer_band.cc +++ b/src/poly/schedule_pass/tile_outer_band.cc @@ -1092,6 +1092,7 @@ isl::schedule_node TileOuterBand::MarkOuterPermutableCuda(isl::schedule_node nod // vectorize for elementwise operator if (scop_info_.analysis_result_.GetOuterBandNode(cur_band_index_)->enable_vectorization) { node = SetTileSizeAndTile(node, TILE_WITH_C0); + node = node.child(0).insert_mark(PROMOTE_GLOBAL_TO_REGISTER); } node = node.ancestor(node.get_tree_depth() - start_depth); @@ -1261,7 +1262,6 @@ isl::schedule_node TileOuterBand::InsertPromoteMarker(const isl::schedule_node n // Add different promotion marks in different positions. 
if (is_matrixc_promote_shared) { tile_node = tile_node.insert_mark(isl::id(tile_node.ctx(), PROMOTE_GLOBAL_TO_SHARED_C)).child(0); - tile_node = tile_node.insert_mark(isl::id(tile_node.ctx(), PROMOTE_SHARED_TO_REGISTER_C)).child(0); } else { tile_node = tile_node.insert_mark(isl::id(tile_node.ctx(), PROMOTE_GLOBAL_TO_REGISTER_C)).child(0); } @@ -1271,21 +1271,16 @@ isl::schedule_node TileOuterBand::InsertPromoteMarker(const isl::schedule_node n } bool TileOuterBand::IsMatrixCPromoteToShared() { - std::string shared_tensors = scop_info_.user_config_.GetSharedTensors(); + std::unordered_set shared_tensors = scop_info_.user_config_.GetSharedTensors(); if (shared_tensors.empty()) { return false; } - shared_tensors += " "; - auto pos = shared_tensors.find(" "); - while (pos != std::string::npos) { - std::string tensor = shared_tensors.substr(0, pos); + for (const auto &tensor : shared_tensors) { auto matmul_map = scop_info_.analysis_result_.GetMatrixMatmulMap(); if (matmul_map.count(tensor) && (matmul_map[tensor] == MATRIX_C || matmul_map[tensor] == MATRIX_ELSE)) { return true; } - shared_tensors = shared_tensors.substr(pos + 1, shared_tensors.size()); - pos = shared_tensors.find(" "); } return false; } diff --git a/src/poly/schedule_pass_cpu/cpu_memory_manager.cc b/src/poly/schedule_pass_cpu/cpu_memory_manager.cc index a1359872afe3db0b4a565a79dc704395bfe32d9d..d9c210d000b6c4f0eed31aaaf8e20425257fe848 100644 --- a/src/poly/schedule_pass_cpu/cpu_memory_manager.cc +++ b/src/poly/schedule_pass_cpu/cpu_memory_manager.cc @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "poly/schedule_pass_gpu/operator_shared_strategy.h" +#include "poly/create_cluster.h" #include "cpu_memory_manager.h" #include "poly/schedule_tree_util.h" #include "poly/scop.h" @@ -59,8 +59,8 @@ isl::schedule CpuMemoryManager::HoistCpuMemory() { } mark_names_ = {PROMOTE_GLOBAL_TO_REGISTER}; - CpuMemoryStrategy other_op(scop_info_, mark_names_, band_index_); - other_op.CreateClusterList(orig_node); + CpuCreateCluster create_cluster(scop_info_, band_index_); + create_cluster.CreateClusterListForGemm(orig_node, mark_names_); auto node = orig_node; for (auto name : mark_names_) { mark_name = name; @@ -188,42 +188,6 @@ isl::schedule_node CpuMemoryManager::HoistMemory(isl::schedule_node &tree, GpuMe return res_node; } -bool CpuMemoryManager::CoalescingAccessWay(const isl::schedule_node &node, const TensorFootprintCluster &cluster) { - isl::union_map original = cluster.OrigianlAccessRelations(); - size_t tensor_dim = cluster.foot_print_.GetBoxDim(); - std::vector thread_marker = CollectFnNode(IsThreadMappedMark, schedule_.root()); - for (auto item : thread_marker) { - if (!(item.isa()) && !(item.has_children()) && - !(item.child(0).isa())) { - continue; - } - isl::schedule_node thread_filter = item.child(0); - if (!thread_filter.has_children()) { - continue; - } - isl::schedule_node thread_band = thread_filter.child(0); - if (!thread_band.has_children()) { - continue; - } - isl::schedule_node inner_band = thread_band.child(0); - size_t num_mapped_thread = inner_band.schedule_depth() - thread_band.schedule_depth(); - if (num_mapped_thread == 0) { - continue; - } - size_t inner_depth = inner_band.schedule_depth(); - auto active_domains = CollectDomain(thread_band); - auto local_access = original.intersect_domain(active_domains); - auto schedule = ShortSchedule(inner_band); - auto schedule_access = local_access.apply_domain(schedule); - for (auto access : schedule_access.get_map_list()) { - if (!IsSubsetForIncreaseDim(access, tensor_dim - 1, inner_depth - 1)) { 
- return true; - } - } - } - return false; -} - isl::schedule CpuMemoryManager::InsertVectorizedMarker(const isl::schedule &sch) { auto GetPromotedWriteFilter = [this](isl::schedule_node node) -> isl::schedule_node { if (!node.isa() || !node.has_parent() || !node.parent().isa()) { diff --git a/src/poly/schedule_pass_cpu/cpu_memory_manager.h b/src/poly/schedule_pass_cpu/cpu_memory_manager.h index bfd1528dfb0cdeb0fe4a923923bfad3eb1ec8a47..0a90a21f0ade438f9b1770934551bd60d4e94a4a 100644 --- a/src/poly/schedule_pass_cpu/cpu_memory_manager.h +++ b/src/poly/schedule_pass_cpu/cpu_memory_manager.h @@ -43,8 +43,6 @@ class CpuMemoryManager : public SchedulePass { const isl::id &dst_tensor_id, TensorFootprintCluster &cluster, bool force_last_extension_odd); - bool CoalescingAccessWay(const isl::schedule_node &node, const TensorFootprintCluster &cluster); - isl::schedule InsertVectorizedMarker(const isl::schedule &sch); isl::schedule HoistCpuMemory(); diff --git a/src/poly/schedule_pass_gpu/mapping_outer_band.cc b/src/poly/schedule_pass_gpu/mapping_outer_band.cc index 50c71391aceb5f63a83728d26a9007be4ba35653..0597d286fab56c8379755aa0295ca07fcea3666c 100644 --- a/src/poly/schedule_pass_gpu/mapping_outer_band.cc +++ b/src/poly/schedule_pass_gpu/mapping_outer_band.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -401,31 +401,35 @@ isl::schedule_node MappingOuterBand::DoThreadMapping(const isl::schedule_node &o thread_record.emplace_back(std::make_pair(node, mapped_threads)); return node; } - - if (node.n_children() <= 1 || NumMappedDescendant(thread_record, node) <= 0) { - return node; - } - node = MapSequenceNode(node, thread_record); - - auto need_sync = node.isa(); - if (need_sync) { - if (is_reduce_stmt && node.has_parent() && !GetMarkerName(node.parent(), INSERT_SYNC).empty()) { - node = node.parent().del(); - node = DoThreadSynchronization(node); - } else if (!is_reduce_stmt && scop_info_.user_config_.GetEnableTensorCoreUsePoly()) { - std::vector other_mapping_cfg; - other_mapping_cfg.push_back(scop_info_.user_config_.GetReplaceConfig()[WARP_COMPUTE]); - node = DoThreadSynchronization(node, other_mapping_cfg); - } else if (!is_reduce_stmt) { - node = DoThreadSynchronization(node); - } - } - + node = DoSequenceNodeMapping(node, thread_record, is_reduce_stmt); return node; }; return orig_node.map_descendant_bottom_up(MapFromInner); } +isl::schedule_node MappingOuterBand::DoSequenceNodeMapping(const isl::schedule_node &orig_node, + const RoadMap &thread_record, const bool is_reduce_stmt) { + if (orig_node.n_children() <= 1 || NumMappedDescendant(thread_record, orig_node) <= 0) { + return orig_node; + } + isl::schedule_node node = MapSequenceNode(orig_node, thread_record); + + auto need_sync = node.isa(); + if (need_sync) { + if (is_reduce_stmt && node.has_parent() && !GetMarkerName(node.parent(), INSERT_SYNC).empty()) { + node = node.parent().del(); + node = DoThreadSynchronization(node); + } else if (!is_reduce_stmt && scop_info_.user_config_.GetEnableTensorCoreUsePoly()) { + std::vector other_mapping_cfg; + other_mapping_cfg.push_back(scop_info_.user_config_.GetReplaceConfig()[WARP_COMPUTE]); + node = DoThreadSynchronization(node, other_mapping_cfg); + } else if (!is_reduce_stmt) { + node = DoThreadSynchronization(node); + } + } + return node; +} + void 
MappingOuterBand::AdjustBlockConfig(MappingCfg *block_cfg, unsigned long n_block_map) { // For scalar case that do not consider coincidence (reset during restart in pass mgr), there is usually only one // member in outer band and we can map the maximal block size to that member. diff --git a/src/poly/schedule_pass_gpu/mapping_outer_band.h b/src/poly/schedule_pass_gpu/mapping_outer_band.h index c3ee450ac1f8a6c20ef4ad7acd2a804ae868936f..10c45fff7429b865e1363411560a3f1312ddcad5 100644 --- a/src/poly/schedule_pass_gpu/mapping_outer_band.h +++ b/src/poly/schedule_pass_gpu/mapping_outer_band.h @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,11 +47,15 @@ class MappingOuterBand : public SchedulePass { size_t NumMappedDescendant(const RoadMap &thread_roadmap, const isl::schedule_node &parent); bool CanBeMappedToThread(const isl::schedule_node &node, const RoadMap &thread_record, const std::string &marker_name); - isl::schedule_node FillRemainingThreads(const isl::schedule_node &orig_node, size_t begin); - isl::schedule_node MapSequenceNode(const isl::schedule_node &orig_node, const RoadMap &thread_record); bool IsEnableReduceLib(const isl::schedule_node &orig_node); void AdjustBlockConfig(MappingCfg *block_cfg, unsigned long n_block_map); + // Sequence node mapping + isl::schedule_node DoSequenceNodeMapping(const isl::schedule_node &orig_node, const RoadMap &thread_record, + const bool is_reduce_stmt); + isl::schedule_node MapSequenceNode(const isl::schedule_node &orig_node, const RoadMap &thread_record); + isl::schedule_node FillRemainingThreads(const isl::schedule_node &orig_node, size_t begin); + // Functions related to synchronization. 
isl::schedule_node DoThreadSynchronization(const isl::schedule_node &node, const std::vector &other_mapping_cfg = {}); diff --git a/src/poly/schedule_pass_gpu/operator_mapping_strategy.cc b/src/poly/schedule_pass_gpu/operator_mapping_strategy.cc index 730a11226e53689e5417814da6e54cae9a5b383a..5310657106e6284b85df6ed562ddddff00f74b42 100644 --- a/src/poly/schedule_pass_gpu/operator_mapping_strategy.cc +++ b/src/poly/schedule_pass_gpu/operator_mapping_strategy.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021 Huawei Technologies Co., Ltd + * Copyright 2021-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -136,8 +136,10 @@ isl::schedule_node OperatorMappingStrategy::MapDimToThreadsBlocks(const isl::sch if (is_promotion_mapping_ || scop_info_.user_config_.GetMindTrickWasUsed()) { node = CheckMapSizeAndApplyTile(node, upa_list, required_mapping_strategy_, mapping_cfg_); is_tiled = !node.is_equal(orig_node); + // insert node with specific marker if (is_insert_marker) { std::string marker_name = is_thread_mapping_ ? THREAD_MARKER : BLOCK_MARKER; + marker_name = is_promotion_mapping_ ? 
marker_name + SHARE_SUFFIX : marker_name; node = node.insert_mark(isl::id(node.ctx(), marker_name)).child(0); } @@ -600,8 +602,9 @@ isl::schedule_node ReduceMappingStrategy::InsertReduceExtension(const isl::sched insert_node = InsertExtensionNodeBeforeOrAfter(insert_node, sync_id, false).parent(); insert_node = insert_node.parent().insert_mark(REDUCE_AREA_FLAG); - if (!GetMarkerName(insert_node.ancestor(2), REDUCE_MARKER).empty()) { - insert_node = insert_node.ancestor(2).del(); + auto tmp_node = insert_node.parent().parent(); + if (!GetMarkerName(tmp_node, REDUCE_MARKER).empty()) { + insert_node = tmp_node.del(); } return insert_node; diff --git a/src/poly/schedule_pass_gpu/operator_shared_strategy.cc b/src/poly/schedule_pass_gpu/operator_shared_strategy.cc deleted file mode 100644 index 3d97167279479d07611425c82b4ee3902ab72c17..0000000000000000000000000000000000000000 --- a/src/poly/schedule_pass_gpu/operator_shared_strategy.cc +++ /dev/null @@ -1,291 +0,0 @@ -/** - * Copyright 2021-2022 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "operator_shared_strategy.h" -#include "poly/schedule_tree_util.h" -#include "poly/scop.h" -#include "poly/dma_inject.h" -#include "poly/poly_util.h" -#include -#include - -namespace akg { -namespace ir { -namespace poly { - -std::set OperatorSharedStrategy::GetInitPromotedTensor() { - auto read_map = scop_info_.StmtReadMap(); - auto write_map = scop_info_.StmtWriteMap(); - std::set id_sets; - std::set read_sets; - std::set write_sets; - for (auto item : read_map) { - for (auto item_id : item.second) { - if (read_sets.count(item_id.get_name()) == 0) { - read_sets.insert(item_id.get_name()); - } - } - } - for (auto item : write_map) { - for (auto item_id : item.second) { - if (write_sets.count(item_id.get_name()) == 0) { - write_sets.insert(item_id.get_name()); - } - } - } - /********************************************************* - * manage only read tensors to share memory - * for read and write tensor, should be managed to local memory - ********************************************************/ - std::set_difference(read_sets.begin(), read_sets.end(), write_sets.begin(), write_sets.end(), - std::inserter(id_sets, id_sets.begin())); - - if (scop_info_.analysis_result_.GetTensorOfTensor() || - scop_info_.analysis_result_.GetOpTemplate() == Template::COUNT_OP) { - id_sets.clear(); - std::set_union(read_sets.begin(), read_sets.end(), write_sets.begin(), write_sets.end(), - std::inserter(id_sets, id_sets.begin())); - } - - return id_sets; -} - -void OperatorSharedStrategy::RecordPromotedTensorInfo(const isl::schedule_node &orig_node, - const std::set &id_sets, - const std::string &mark_name) { - std::vector tensor_list; - for (auto item : id_sets) { - tensor_list.push_back(isl::id(scop_info_.ctx_, item)); - } - isl::union_map reads = scop_info_.analysis_result_.GetReads(); - isl::union_map writes = scop_info_.analysis_result_.GetWrites(); - isl::union_map copyin = scop_info_.analysis_result_.GetCopyin(); - isl::union_map fake_copyin = 
scop_info_.analysis_result_.GetFakeCopyin(); - - std::vector nodes = CollectMarkNode(orig_node, mark_name); - - // Collect block config. - auto block_cfg = scop_info_.user_config_.GetBlockConfig(); - CHECK(block_cfg != nullptr) << "block config is null"; - auto replace_cfg = scop_info_.user_config_.GetReplaceConfig(); - MappingStrategyAxisMap mapping_strategy = scop_info_.user_config_.GetOuterMappingStrategy(band_index_); - std::unordered_set non_repeated_idx = GetNonRepeatedIdx(mapping_strategy); - auto mapping_filter_info = GetMappingFilterInfo(orig_node.root(), block_cfg, replace_cfg, non_repeated_idx); - - for (const auto &node : nodes) { - auto tree = node.parent(); - CHECK(!IsAncestorMapToThread(tree)) << "shared memory promotion cannot below thread_marker."; - auto partial_sched = LocalSchedule(tree); - if (!mapping_filter_info.is_empty()) { - partial_sched = partial_sched.intersect_domain(mapping_filter_info); - } - - for (const auto &item : tensor_list) { - GpuMemType gpu_mem_type = GpuMemType::SHARED; - MemType mem_type = MemType::SHARED_; - if (is_local_) { - gpu_mem_type = GpuMemType::LOCAL; - mem_type = MemType::LOCAL_; - } - - isl::id dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, item); - if (scop_info_.IsCopyinTensor(item.get_name()) && band_index_ != 0) { - dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, item, band_index_); - } - std::vector buffer_sizes; - std::vector> data_stream; - data_stream.push_back(std::make_pair(item, MemType::DDR)); - data_stream.push_back(std::make_pair(item, mem_type)); - BufferDefInfo promoted_info = BufferDefInfo{item, - dst_tensor_id, - item, - MemType::DDR, - mark_name, - false, - false, - data_stream, - Tensor(), - Handle(), - buffer_sizes, - nullptr, - isl::union_map::empty(isl::space(scop_info_.ctx_, 0))}; - promoted_info.footprints_cluster = - TensorFootprintCluster::HoistBufferFootprintCluster(partial_sched, item, reads, copyin, writes, fake_copyin); - if (promoted_info.footprints_cluster == nullptr) { - 
continue; - } - promoted_info.footprint_cluster_map.emplace_back(std::make_pair(tree, promoted_info.footprints_cluster)); - scop_info_.analysis_result_.buffer_def_infos_.push_back(promoted_info); - } - } -} - -void OperatorSharedStrategy::RecordCustomPromotedTensors(std::set &id_sets) { - if (scop_info_.user_config_.GetSharedTensors().empty()) { - return; - } - std::vector configed_tensors = Split(scop_info_.user_config_.GetSharedTensors(), " "); - for (const auto &item : configed_tensors) { - if (id_sets.count(item) == 0) { - id_sets.emplace(item); - } - } -} - -void OperatorSharedStrategy::DeleteNotPromotedTensors(std::set &id_sets) { - if (scop_info_.analysis_result_.GetTensorsNotPromote().empty()) { - return; - } - std::unordered_set tensors = scop_info_.analysis_result_.GetTensorsNotPromote(); - for (const auto &item : tensors) { - if (id_sets.count(item)) { - id_sets.erase(item); - } - } -} - -void OperatorSharedStrategy::CreateClusterList(const isl::schedule_node &node) { - std::set id_sets = GetInitPromotedTensor(); - RecordCustomPromotedTensors(id_sets); - DeleteNotPromotedTensors(id_sets); - for (auto mark_name : mark_names_) { - RecordPromotedTensorInfo(node, id_sets, mark_name); - } -} - -void ReduceSharedStrategy::CreateClusterList(const isl::schedule_node &node) { - std::set id_sets = AnalysisReduceTensors(); - RecordCustomPromotedTensors(id_sets); - for (auto mark_name : mark_names_) { - RecordPromotedTensorInfo(node, id_sets, mark_name); - } -} - -std::set ReduceSharedStrategy::AnalysisReduceTensors() { - std::set id_sets; - /************************************************* - * In order to enable cuda atomic operator, add - * these tensors for shared memory promotion list - *************************************************/ - auto atomic_tensors = scop_info_.analysis_result_.GetAtomicTensors(); - if (!atomic_tensors.empty()) { - for (const auto &item : atomic_tensors) { - if (id_sets.count(item.tensor_name) == 0) { - 
id_sets.emplace(item.tensor_name); - } - } - } - - /*********************************************** - * For the condition that it is without cuda - * atomic usage, but with reduce operation. - * Also need to add these tensors for shared memory - * promotion list. - *********************************************/ - auto reduce_out_tensors = scop_info_.analysis_result_.GetReduceTensorInfoMap(); - for (const auto &item : reduce_out_tensors) { - if (id_sets.count(item.second.write_tensor_name) == 0) { - id_sets.emplace(item.second.write_tensor_name); - } - } - - return id_sets; -} - -void BatchMatmulSharedStrategy::CreateClusterList(const isl::schedule_node &node) { - std::set id_sets = GetInitPromotedTensor(); - RecordCustomPromotedTensors(id_sets); - - auto tensors = GetMatmulTensorsName(scop_info_); - if (id_sets.count(tensors[MATRIX_A]) == 0) { - id_sets.emplace(tensors[MATRIX_A]); - } - if (id_sets.count(tensors[MATRIX_B]) == 0) { - id_sets.emplace(tensors[MATRIX_B]); - } - - auto DeleteTensorSets = [this](const std::set &id_sets, - const bool hoist_tensor_c) -> std::set { - std::set final_id_sets = id_sets; - auto it = final_id_sets.begin(); - while (it != final_id_sets.end()) { - if (!hoist_tensor_c) { - if (GetTensorMark(*it, scop_info_) == TENSOR_C) { - it = final_id_sets.erase(it); - continue; - } - } else { - if (GetTensorMark(*it, scop_info_) != TENSOR_C) { - it = final_id_sets.erase(it); - continue; - } - } - ++it; - } - return final_id_sets; - }; - - for (auto mark_name : mark_names_) { - bool hoist_tensor_c = mark_name == PROMOTE_GLOBAL_TO_SHARED_C; - auto final_id_sets = DeleteTensorSets(id_sets, hoist_tensor_c); - RecordPromotedTensorInfo(node, final_id_sets, mark_name); - } -} - -std::set CpuMemoryStrategy::GetInitPromotedTensor() { - auto read_map = scop_info_.StmtReadMap(); - auto write_map = scop_info_.StmtWriteMap(); - std::set id_sets; - for (auto item : read_map) { - for (auto item_id : item.second) { - if (id_sets.count(item_id.get_name()) == 0) 
{ - id_sets.insert(item_id.get_name()); - } - } - } - - for (auto item : write_map) { - for (auto item_id : item.second) { - if (id_sets.count(item_id.get_name()) == 0) { - id_sets.insert(item_id.get_name()); - } - } - } - - return id_sets; -} - -void CpuMemoryStrategy::CreateClusterList(const isl::schedule_node &node) { - std::set id_sets = GetInitPromotedTensor(); - RecordCustomPromotedTensors(id_sets); - DeleteNotPromotedTensors(id_sets); - auto it = id_sets.begin(); - while (it != id_sets.end()) { - if (GetTensorMark(*it, scop_info_) == TENSOR_C) { - it = id_sets.erase(it); - continue; - } - ++it; - } - - for (auto mark_name : mark_names_) { - RecordPromotedTensorInfo(node, id_sets, mark_name); - } -} - -} // namespace poly -} // namespace ir -} // namespace akg diff --git a/src/poly/schedule_pass_gpu/operator_shared_strategy.h b/src/poly/schedule_pass_gpu/operator_shared_strategy.h deleted file mode 100644 index 124caf8737c9199fa0757ea4c53734ca2772e355..0000000000000000000000000000000000000000 --- a/src/poly/schedule_pass_gpu/operator_shared_strategy.h +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Copyright 2021 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef POLY_OPRATOR_SHARED_STRATEGY_H_ -#define POLY_OPRATOR_SHARED_STRATEGY_H_ - -#include "poly/schedule_pass.h" - -namespace akg { -namespace ir { -namespace poly { - -class OperatorSharedStrategy { - public: - explicit OperatorSharedStrategy(ScopInfo &scop_info, std::unordered_set &mark_names, int filter_pos) - : scop_info_(scop_info), mark_names_(mark_names), band_index_(filter_pos) {} - ~OperatorSharedStrategy() {} - - std::set GetInitPromotedTensor(); - void RecordPromotedTensorInfo(const isl::schedule_node &orig_node, const std::set &id_sets, - const std::string &mark_names); - void CreateClusterList(const isl::schedule_node &node); - void RecordCustomPromotedTensors(std::set &id_sets); - void DeleteNotPromotedTensors(std::set &id_sets); - - protected: - ScopInfo &scop_info_; - std::unordered_set mark_names_; - int band_index_; - bool is_local_{false}; -}; - -class ReduceSharedStrategy : public OperatorSharedStrategy { - public: - explicit ReduceSharedStrategy(ScopInfo &scop_info, std::unordered_set &mark_names, int filter_pos) - : OperatorSharedStrategy(scop_info, mark_names, filter_pos) {} - ~ReduceSharedStrategy() {} - - void CreateClusterList(const isl::schedule_node &node); - std::set AnalysisReduceTensors(); -}; - -class BatchMatmulSharedStrategy : public OperatorSharedStrategy { - public: - explicit BatchMatmulSharedStrategy(ScopInfo &scop_info, std::unordered_set &mark_names, int filter_pos) - : OperatorSharedStrategy(scop_info, mark_names, filter_pos) {} - ~BatchMatmulSharedStrategy() {} - - void CreateClusterList(const isl::schedule_node &node); -}; - -class CpuMemoryStrategy : public OperatorSharedStrategy { - public: - explicit CpuMemoryStrategy(ScopInfo &scop_info, std::unordered_set &mark_names, int filter_pos) - : OperatorSharedStrategy(scop_info, mark_names, filter_pos) { - is_local_ = true; - } - ~CpuMemoryStrategy() {} - std::set GetInitPromotedTensor(); - void CreateClusterList(const isl::schedule_node &node); -}; -} // namespace 
poly -} // namespace ir -} // namespace akg - -#endif // POLY_OPRATOR_SHARED_STRATEGY_H_ \ No newline at end of file diff --git a/src/poly/schedule_pass_gpu/register_memory_manager.cc b/src/poly/schedule_pass_gpu/register_memory_manager.cc index 3ca6fef22b83e330b9281f7b180a2ee1679d78da..7cec1b89dc485aa42bb9fb1919cf645aa9313fe6 100644 --- a/src/poly/schedule_pass_gpu/register_memory_manager.cc +++ b/src/poly/schedule_pass_gpu/register_memory_manager.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,332 +15,192 @@ */ #include "register_memory_manager.h" - -#include - +#include "poly/create_cluster.h" #include "poly/scop.h" #include "poly/dma_inject.h" #include "poly/poly_util.h" +#include namespace akg { namespace ir { namespace poly { -void RegisterMemoryManager::GetActualPromotedSharedTensors() { - for (const auto &buffer : scop_info_.analysis_result_.active_buffer_footprints_) { - auto cluster_id = buffer.second.cluster_id; - shared_tensors_ += cluster_id.name() + " "; +isl::schedule RegisterMemoryManager::Run(isl::schedule sch) { + if (!scop_info_.user_config_.GetUseRegisterMemory()) { + return sch; } -} -isl::schedule RegisterMemoryManager::HoistRegisterMemoryOnDepth(isl::schedule_node &node, size_t depth) { - auto res_node = node; - isl::schedule_node root_node = node.get_schedule().get_root(); + schedule_ = sch; + sch = HoistRegisterMemory(); + return sch; +} - auto block_cfg = scop_info_.user_config_.GetBlockConfig(); - CHECK(block_cfg != nullptr) << "block config is null"; - auto replace_cfg = scop_info_.user_config_.GetReplaceConfig(); - MappingStrategyAxisMap mapping_strategy = scop_info_.user_config_.GetOuterMappingStrategy(0); - std::unordered_set non_repeated_idx = GetNonRepeatedIdx(mapping_strategy); - auto block_mapping = 
GetMappingFilterInfo(root_node, block_cfg, replace_cfg, non_repeated_idx); - - auto thread_cfg = scop_info_.user_config_.GetThreadConfig(); - CHECK(thread_cfg != nullptr) << "thread config is null"; - auto thread_mapping = isl::union_set::empty(block_mapping.ctx()); - mapping_strategy = scop_info_.user_config_.GetInnerMappingStrategy(0); - non_repeated_idx = GetNonRepeatedIdx(mapping_strategy); - thread_mapping = GetMappingFilterInfo(root_node, thread_cfg, replace_cfg, non_repeated_idx); - - auto partial_sched = LocalSchedule(node); - if (!thread_mapping.is_empty() && !block_mapping.is_empty()) { - auto mapping = block_mapping.intersect(thread_mapping); - partial_sched = partial_sched.intersect_domain(mapping); - } else if (!thread_mapping.is_empty()) { - partial_sched = partial_sched.intersect_domain(thread_mapping); - } else if (!block_mapping.is_empty()) { - partial_sched = partial_sched.intersect_domain(block_mapping); +isl::schedule_node RegisterMemoryManager::HoistRegisterMemoryOnMark(const isl::schedule_node &orig_node) { + current_outer_bn_ = scop_info_.analysis_result_.GetOuterBandNode(band_index_); + if (!current_outer_bn_->use_register_memory) { + return orig_node; } - CreateTensorCluster(node, partial_sched); + CreateClusterForOperator(orig_node); - isl::schedule sch = schedule_; + std::string mark_name; + auto GetMarkNode = [this, &mark_name](isl::schedule_node node) -> isl::schedule_node { + if (!node.isa()) { + return node; + } - auto thread_schedule = MapDomainAllWithType(root_node, thread_cfg, scop_info_.upa_node_mapping_, THREAD_MARKER); - auto block_schedule = MapDomainAllWithType(root_node, block_cfg, scop_info_.upa_node_mapping_, BLOCK_MARKER); + std::string tmp_mark_name = node.as().get_id().get_name(); + if (tmp_mark_name != mark_name) { + return node; + } - auto tmp_node = res_node; - if (node.isa()) { - tmp_node = res_node.child(0); - } + return HoistClusters(node.parent()).child(0); + }; - int64_t alloc_threads = 1; - if (thread_cfg != 
nullptr) { - for (size_t i = 0; i < thread_cfg->bound; ++i) { - alloc_threads *= thread_cfg->GetAt(i).second; - } + auto node = orig_node; + for (auto name : mark_names_) { + mark_name = name; + node = MapDescendantTopDown(node, GetMarkNode); } + node = InsertMarkerForEmit(node); + node = DeleUselessMarker(node, mark_names_); + return node; +} - auto partial_sched_mupa = ShortScheduleMupa(root_node, tmp_node); - auto partial_sched_with_block = isl::union_map::from(partial_sched_mupa).intersect_domain(block_mapping); - partial_sched_mupa = partial_sched_mupa.flat_range_product(block_schedule).flat_range_product(thread_schedule); - for (size_t index = 0; index < scop_info_.analysis_result_.buffer_def_infos_.size(); index++) { - BufferDefInfo &buffer_info = scop_info_.analysis_result_.buffer_def_infos_[index]; - - if (buffer_info.dst_tensor_id.to_str().find(SHARE_SUFFIX) != std::string::npos) { - continue; - } +isl::schedule RegisterMemoryManager::HoistRegisterMemory() { + isl::schedule_node node = GetOuterBand(schedule_.root()); + if (node.isa()) { + node = HoistRegisterMemoryOnMark(node); + } else { + int number = static_cast(node.n_children()); + for (int i = 0, current_band_index = 0; i < number; ++i) { + auto promotion_node = node.child(i).child(0); + if (promotion_node.isa()) continue; - if (scop_info_.user_config_.GetEnableMatmul() && !hoist_tensor_all_) { - if (!hoist_compute_local_tensor_) { - if (GetTensorMark(buffer_info.dst_tensor_id.get_name(), scop_info_) == TENSOR_C) { - continue; - } - } else { - if (GetTensorMark(buffer_info.dst_tensor_id.get_name(), scop_info_) != TENSOR_C) { - continue; - } - } + mark_names_.clear(); + band_index_ = current_band_index; + node = HoistRegisterMemoryOnMark(promotion_node); + node = node.parent().parent(); + ++current_band_index; } + } - auto fp_cluster = buffer_info.GetFootPrintClusterGPU(res_node); + return node.get_schedule(); +} - if (fp_cluster == nullptr || !fp_cluster->foot_print_.box.is_valid()) { - continue; - 
} +void RegisterMemoryManager::SetPromotedWriteNameForGemm(std::string &local_tensor_c) { + write_name_ = GML_WRITE_ID_NAME; + std::string shared_tensors; + for (const auto &buffer : scop_info_.analysis_result_.active_buffer_footprints_) { + auto cluster_id = buffer.second.cluster_id; + shared_tensors += cluster_id.name() + " "; + } + if (shared_tensors.find(local_tensor_c) != std::string::npos) { + write_name_ = SHARED_WRITE_ID_NAME; + } +} - auto tensor_id = buffer_info.tensor_id; - auto box_sizes = fp_cluster->GetFixedBoxSizes(); +void RegisterMemoryManager::CreateClusterForOperator(const isl::schedule_node &node) { + RegisterCreateCluster create_cluster(scop_info_, band_index_); + if (scop_info_.user_config_.GetEnableMatmul()) { + // matmul operator + std::string local_tensor_c = GetMatmulTensorsName(scop_info_)[MATRIX_C]; + SetPromotedWriteNameForGemm(local_tensor_c); - if (box_sizes.size() == 0) { - LOG(FATAL) << "Can not manage a scalar tensor in register memory promotion"; + auto config_shared_tensors = scop_info_.user_config_.GetSharedTensors(); + auto c_mark = PROMOTE_GLOBAL_TO_REGISTER_C; + if (config_shared_tensors.find(local_tensor_c) != config_shared_tensors.end()) { + c_mark = PROMOTE_SHARED_TO_REGISTER_C; } - if (!IsPromote(*fp_cluster, partial_sched_mupa, thread_schedule)) { - continue; - } + mark_names_.emplace(PROMOTE_SHARED_TO_REGISTER_AB); + mark_names_.emplace(c_mark); - if (!scop_info_.user_config_.GetEnableTensorCore() && !scop_info_.user_config_.GetEnableMatmul() && - !scop_info_.user_config_.GetEnableVectorization()) { - if (!ReuseTensorCluster(*fp_cluster, partial_sched_mupa)) { - continue; - } - } + create_cluster.CreateClusterListForGemm(node, mark_names_); + } else { + mark_names_.emplace(PROMOTE_GLOBAL_TO_REGISTER); + create_cluster.CreateClusterListForElementWise(node, mark_names_); + } +} - auto tensor_size = std::accumulate(box_sizes.begin(), box_sizes.end(), 1, std::multiplies()); +isl::schedule_node 
RegisterMemoryManager::InsertMarkerForEmit(const isl::schedule_node &orig_node) { + auto node = orig_node; + if (scop_info_.user_config_.GetEnableMatmul()) { if (scop_info_.user_config_.GetEnableTensorCoreUsePoly()) { - tensor_size = (buffer_info.tensor_id.get_name() == local_tensor_c_) ? (tensor_size / alloc_threads) - : (tensor_size * 2 / alloc_threads); + node = TileTensorAccordingInterfaceValue(orig_node); } - auto data_bytes = scop_info_.user_config_.GetDataBytes(buffer_info.tensor_id.get_name()); - tensor_size = tensor_size * std::max(1, data_bytes / BYTES_PER_REGISTER); - size_t memory_requirement = tensor_size * alloc_threads; - - if (memory_requirement <= remain_memory_) { - auto active_domains = CollectDomain(res_node); - isl::id dst_tensor_id = buffer_info.dst_tensor_id; - GatherBufferFootprintDefInfo(res_node, buffer_info); - if (scop_info_.user_config_.GetEnableMatmul()) { - if (tensor_id.get_name().find(SHARE_SUFFIX) != std::string::npos) { - std::shared_ptr src_fp_cluster; - isl::union_map sch_map = scop_info_.analysis_result_.GetScheduleMapBeforeTile(); - for (auto &buffer : scop_info_.analysis_result_.active_buffer_footprints_) { - if (tensor_id == buffer.second.cluster_id) { - src_fp_cluster = buffer.second.cluster; - break; - } - } - if (src_fp_cluster != nullptr) { - node = PlaceInnerDataCopyBelow(scop_info_, node, *fp_cluster, *src_fp_cluster, tensor_id, dst_tensor_id, - tensor_id, sch_map); - } - } else { - node = PlaceOuterDataCopyBelow(scop_info_, node, *fp_cluster, tensor_id, dst_tensor_id, partial_sched, - schedule_.get_domain().get_space()); - } - } else { - node = PlaceOuterDataCopyBelow(scop_info_, node, *fp_cluster, tensor_id, dst_tensor_id, partial_sched, - schedule_.get_domain().get_space()); - } - - remain_memory_ -= memory_requirement; - - // active_buffer_footprints for codegen - scop_info_.analysis_result_.active_buffer_footprints_.emplace_back(std::make_pair( - active_domains, 
BufferedFootPrintInfo{std::shared_ptr(std::move(fp_cluster)), - partial_sched, dst_tensor_id})); - buffer_info.find_buffer = true; + std::string marker_name = PROMOTE_REGISTER_TO_GLOBAL; + if (write_name_ == SHARED_WRITE_ID_NAME) { + marker_name = PROMOTE_REGISTER_TO_SHARED; } + node = InsertMarkerForThreadGroup(node, write_name_, marker_name); + } else if (current_outer_bn_->enable_vectorization) { + node = InsertMarkerForThreadGroup(node, GML_READ_ID_NAME, PROMOTE_VECTORIZATION); + node = InsertMarkerForThreadGroup(node, GML_WRITE_ID_NAME, PROMOTE_VECTORIZATION); } - sch = node.get_schedule(); - return sch; + return node; } -/*Check if the given "group" can be promoted to registers for the given - * mapping to thread identifiers and within the given outer schedule */ -bool RegisterMemoryManager::IsPromote(const TensorFootprintCluster &fp_cluster, - const isl::multi_union_pw_aff &partial_sched_mupa, - const isl::multi_union_pw_aff &thread_schedule) { - /* compute the mapping relation between single thread and outer schedule space and tensor elements pair */ - isl::union_map state_schedule_mapping = - ScheduleTensorMapping(partial_sched_mupa, fp_cluster.OrigianlAccessRelations()); - isl::union_map thread_schedule_mapping = state_schedule_mapping.apply_domain(isl::union_map::from(thread_schedule)); - /* check that whether the mapping relation between single thread - * and outer schedule points and group elements pair is injective. 
*/ - return thread_schedule_mapping.is_injective(); -} +isl::schedule_node RegisterMemoryManager::HoistClusters(const isl::schedule_node &node) { + auto res_node = node; + isl::schedule_node root_node = node.get_schedule().get_root(); -void RegisterMemoryManager::CreateTensorCluster(const isl::schedule_node &node, const isl::union_map &outer_sch) { - isl::union_map reads = scop_info_.analysis_result_.GetReads(); - isl::union_map writes = scop_info_.analysis_result_.GetWrites(); - isl::union_map copyin = scop_info_.analysis_result_.GetCopyin(); - isl::union_map fake_copyin = scop_info_.analysis_result_.GetFakeCopyin(); - - auto read_map = scop_info_.StmtReadMap(); - auto write_map = scop_info_.StmtWriteMap(); - auto stmt_map = scop_info_.analysis_result_.GetStmtOpInfoMap(); - std::vector tensor_list; - std::unordered_set id_sets; - for (auto item : read_map) { - for (auto item_id : item.second) { - id_sets.insert(item_id); - } - } - for (auto item : write_map) { - for (auto item_id : item.second) { - id_sets.insert(item_id); - } - } + isl::schedule sch = schedule_; - std::set shared_dst_tensor_ids; - for (auto buffer : scop_info_.analysis_result_.active_buffer_footprints_) { - shared_dst_tensor_ids.insert(buffer.second.cluster_id.get_name()); - } - if (!configed_tensors_.empty()) { - std::set tensor_sets; - for (const auto &item : configed_tensors_) { - if (tensor_sets.count(item) == 0) { - tensor_sets.emplace(item); - } - } - id_sets.clear(); - for (auto item : tensor_sets) { - id_sets.insert(isl::id(scop_info_.ctx_, item)); - } - } + for (size_t index = 0; index < scop_info_.analysis_result_.buffer_def_infos_.size(); index++) { + BufferDefInfo &buffer_info = scop_info_.analysis_result_.buffer_def_infos_[index]; - for (auto item : id_sets) { - if (scop_info_.user_config_.GetEnableMatmul()) { - tensor_list.push_back(item); - } else { - if (shared_dst_tensor_ids.count(item.get_name() + SHARE_SUFFIX)) { - continue; - } + if 
(buffer_info.dst_tensor_id.to_str().find(SHARE_SUFFIX) != std::string::npos) { + continue; + } - std::unordered_set tensors = scop_info_.analysis_result_.GetTensorsNotPromote(); - if (tensors.count(item.get_name())) { - continue; - } + auto fp_cluster = buffer_info.GetFootPrintClusterGPU(node); - tensor_list.push_back(item); + if (fp_cluster == nullptr || !fp_cluster->foot_print_.box.is_valid()) { + continue; } - } - std::vector promoted_infos; + auto tensor_id = buffer_info.tensor_id; + RegisterCreateCluster create_cluster(scop_info_, band_index_); + isl::union_map partial_sched = create_cluster.GetPartialSchedule(node); - for (const auto &item : tensor_list) { - if (scop_info_.user_config_.GetEnableMatmul() && !hoist_tensor_all_) { - if (!hoist_compute_local_tensor_) { - if (GetTensorMark(item.get_name(), scop_info_) == TENSOR_C) { - continue; + auto active_domains = CollectDomain(node); + isl::id dst_tensor_id = buffer_info.dst_tensor_id; + GatherBufferFootprintDefInfo(node, buffer_info); + if (scop_info_.user_config_.GetEnableMatmul()) { + if (tensor_id.get_name().find(SHARE_SUFFIX) != std::string::npos) { + std::shared_ptr src_fp_cluster; + isl::union_map sch_map = scop_info_.analysis_result_.GetScheduleMapBeforeTile(); + for (auto &buffer : scop_info_.analysis_result_.active_buffer_footprints_) { + if (tensor_id == buffer.second.cluster_id) { + src_fp_cluster = buffer.second.cluster; + break; + } } - } else { - if (GetTensorMark(item.get_name(), scop_info_) != TENSOR_C) { - continue; + if (src_fp_cluster != nullptr) { + if (!GetMarkerName(res_node.child(0), PROMOTE_SHARED_TO_REGISTER_C).empty()) { + res_node = res_node.child(0).del(); + res_node = res_node.parent(); + } + res_node = PlaceInnerDataCopyBelow(scop_info_, res_node, *fp_cluster, *src_fp_cluster, tensor_id, + dst_tensor_id, tensor_id, sch_map); } + } else { + res_node = PlaceOuterDataCopyBelow(scop_info_, res_node, *fp_cluster, tensor_id, dst_tensor_id, partial_sched, + 
schedule_.get_domain().get_space()); } - } - - isl::id dst_tensor_id = GetGpuIndexDstId(GpuMemType::LOCAL, item); - std::vector buffer_sizes; - std::vector> data_stream; - MemType memtype; - BufferDefInfo promoted_info; - isl::id tmp_item; - if (!shared_dst_tensor_ids.count(item.get_name() + SHARE_SUFFIX)) { - tmp_item = item; - data_stream.push_back(std::make_pair(item, MemType::DDR)); - data_stream.push_back(std::make_pair(item, MemType::LOCAL_)); - memtype = MemType::DDR; - } else { - tmp_item = isl::id(scop_info_.ctx_, item.get_name() + SHARE_SUFFIX); - data_stream.push_back(std::make_pair(item, MemType::SHARED_)); - data_stream.push_back(std::make_pair(item, MemType::LOCAL_)); - memtype = MemType::SHARED_; - } - promoted_info = BufferDefInfo{tmp_item, - dst_tensor_id, - tmp_item, - memtype, - "", - false, - false, - data_stream, - Tensor(), - Handle(), - buffer_sizes, - nullptr, - isl::union_map::empty(isl::space(scop_info_.ctx_, 0))}; - promoted_info.footprints_cluster = - TensorFootprintCluster::HoistBufferFootprintCluster(outer_sch, item, reads, copyin, writes, fake_copyin); - if (promoted_info.footprints_cluster != nullptr) { - promoted_info.footprint_cluster_map.emplace_back(std::make_pair(node, promoted_info.footprints_cluster)); - promoted_infos.push_back(promoted_info); - } - } - - for (auto promoted_info : promoted_infos) { - auto name = promoted_info.tensor_id.get_name(); - if (CheckRAW(name)) { - scop_info_.analysis_result_.buffer_def_infos_.insert(scop_info_.analysis_result_.buffer_def_infos_.begin(), - promoted_info); } else { - scop_info_.analysis_result_.buffer_def_infos_.push_back(promoted_info); - } - } -} - -bool RegisterMemoryManager::CheckRAW(std::string &name) { - isl::union_map reads = scop_info_.analysis_result_.GetReads(); - isl::union_map reads_filter = isl::union_map::empty(scop_info_.GetCtx()); - auto reads_range = reads.range(); - isl::union_set reads_uset = isl::union_set::empty(scop_info_.GetCtx()); - 
reads_range.foreach_set([this, name, &reads_uset](const isl::set &s) -> void { - std::string tensor_name = s.get_tuple_name(); - if (tensor_name == name) { - reads_uset = reads_uset.add_set(s); - } - }); - - reads_filter = reads.intersect_range(reads_uset); - isl::union_map writes = scop_info_.analysis_result_.GetWrites(); - isl::union_map writes_filter = isl::union_map::empty(scop_info_.GetCtx()); - auto writes_range = writes.range(); - isl::union_set writes_uset = isl::union_set::empty(scop_info_.GetCtx()); - writes_range.foreach_set([this, name, &writes_uset](const isl::set &s) -> void { - std::string tensor_name = s.get_tuple_name(); - if (tensor_name == name) { - writes_uset = writes_uset.add_set(s); + res_node = PlaceOuterDataCopyBelow(scop_info_, res_node, *fp_cluster, tensor_id, dst_tensor_id, partial_sched, + schedule_.get_domain().get_space()); } - }); - writes_filter = writes.intersect_range(writes_uset); - isl::union_map rawmap = ComputeRAW(scop_info_.origin_schedule_, reads_filter, writes_filter); - // Input/output tensor has lower priority then temp tensor - if (rawmap.is_empty() || scop_info_.IsInBinds(name)) { - return false; - } else { - return true; + // active_buffer_footprints for codegen + scop_info_.analysis_result_.active_buffer_footprints_.emplace_back(std::make_pair( + active_domains, BufferedFootPrintInfo{std::shared_ptr(std::move(fp_cluster)), + partial_sched, dst_tensor_id})); + buffer_info.find_buffer = true; } + return res_node; } void RegisterMemoryManager::GatherBufferFootprintDefInfo(const isl::schedule_node &node, BufferDefInfo &tensor_info) { @@ -372,211 +232,51 @@ void RegisterMemoryManager::GatherBufferFootprintDefInfo(const isl::schedule_nod tensor_info.AddSize(node, sizes); } -size_t RegisterMemoryManager::UpdateDepth(const isl::schedule_node &node) { - auto band = node.as(); - for (size_t i = 0; i < band.n_member(); i++) { - if (!band.member_get_coincident(i)) { - if (i == 0) { - return band.n_member(); - } else { - return i; 
- } - } - } - return band.n_member() + node.schedule_depth(); -} - -isl::schedule RegisterMemoryManager::HoistRegisterMemory(isl::schedule_node root, size_t depth) { - auto bands = BandsContainingScheduleDepth(root, depth); - bands = FilterWithFunc( - [root, depth](isl::schedule_node node) { - auto band = node.as(); - return !IsThreadMappedMark(node) || node.schedule_depth() + band.n_member() == depth; - }, - bands); - bands = BandsSplitAfterDepth(bands, root, depth); - - isl::schedule tmp_sch = root.get_schedule(); - if (!bands.size()) { - return tmp_sch; - } - - int distance_to_extension = 3; - for (auto band : bands) { - if (IsThreadMappedMark(band)) { - band = band.child(0); - } - - if (IsReadOrWriteBand(band)) { - continue; - } - - if (band.has_parent() && band.parent().has_parent() && band.parent().parent().has_parent() && - band.ancestor(distance_to_extension) && - band.ancestor(distance_to_extension).isa()) { - break; - } - tmp_sch = HoistRegisterMemoryOnDepth(band, depth); - break; - } - return tmp_sch; -} - -bool RegisterMemoryManager::IsReadOrWriteBand(isl::schedule_node node) { - if (node.parent().isa()) { - auto filter = node.parent().as(); - - isl::union_set uset = filter.get_filter(); - std::vector vset; - uset.foreach_set([&vset](isl::set s) { vset.push_back(s); }); - if (!vset.empty()) { - auto filter_name = vset[0].get_tuple_name(); - if (filter_name == READ_ID_NAME || filter_name == WRITE_ID_NAME) { - return true; - } - } - } - return false; -} - -isl::schedule_node RegisterMemoryManager::GetRegisterPromotedNode(isl::schedule_node &root) { - isl::schedule_node hoist_register_node = root; - root.foreach_descendant_top_down([&hoist_register_node, this](const isl::schedule_node &node) -> bool { - if (node.isa()) { - auto sequence_node = node.as(); - if (sequence_node.parent().isa() && - sequence_node.parent().parent().isa()) { - hoist_register_node = sequence_node.parent().parent(); - return false; - } else if (sequence_node.parent().isa()) { - 
hoist_register_node = sequence_node.parent(); - return false; - } - } - - if (node.isa()) { - auto mark_node = node.as(); - if (scop_info_.user_config_.GetEnableVectorization()) { - if (mark_node.get_id().get_name() == THREAD_MARKER && - mark_node.child(0).child(0).isa()) { - hoist_register_node = mark_node.child(0).child(0); - return false; - } - } else if (mark_node.get_id().get_name() == THREAD_MARKER && mark_node.parent().isa()) { - hoist_register_node = mark_node.parent(); - return false; - } +// According to the value of the conv interface, the size of the tensor is split to confirm the size of the fragment. +isl::schedule_node RegisterMemoryManager::TileTensorAccordingInterfaceValue(const isl::schedule_node &orig_node) { + CHECK(scop_info_.user_config_.GetReplaceConfig().count(WARP_COMPUTE)) << "Cannot map to warp."; + auto CollectReadWriteFilter = [this](isl::schedule_node node) -> isl::schedule_node { + if (!node.isa()) { + return node; } - return true; - }); - return hoist_register_node; -} - -isl::schedule_node RegisterMemoryManager::PromotedNodeUnderSequence(isl::schedule_node_sequence &node) { - int band_node_num = 0; - auto root = node.get_schedule().get_root(); - auto tmp_node = root; - - for (size_t i = 0; i < node.n_children(); ++i) { - if (IsReadOrWriteBand(node.child(i).child(0))) { - continue; + bool is_all_sets_read_or_write = IsReadOrWriteTensor(node, SHARED_READ_ID_NAME, write_name_); + if (!is_all_sets_read_or_write) { + return node; } - band_node_num += 1; - tmp_node = node.child(i); - } - auto hoist_register_node = root; - if (band_node_num == 1) { - tmp_node.foreach_descendant_top_down([&hoist_register_node](const isl::schedule_node &node) -> bool { - if (node.isa()) { - auto mark_node = node.as(); - if (mark_node.get_id().get_name() == THREAD_MARKER && - mark_node.child(0).child(0).isa()) { - hoist_register_node = mark_node.child(0).child(0); - return false; - } - } - return true; - }); - } - return hoist_register_node; -} + auto 
start_depth = node.get_tree_depth(); -isl::schedule_node RegisterMemoryManager::GetVectorizationPromotedNode(isl::schedule_node &root) { - isl::schedule_node hoist_register_node = root; - root.foreach_descendant_top_down([&hoist_register_node, this](const isl::schedule_node &node) -> bool { - if (node.isa()) { - auto sequence_node = node.as(); - if (sequence_node.parent().isa() && - sequence_node.parent().parent().isa()) { - hoist_register_node = PromotedNodeUnderSequence(sequence_node); - return false; - } else if (sequence_node.parent().isa()) { - return false; - } + auto band_node = GetCanMappingNode(node); + std::string id_name = GetPromotionTensorName(band_node, scop_info_.analysis_result_.buffer_def_infos_); + if (id_name.empty() || !scop_info_.analysis_result_.GetMatrixMatmulMap().count(id_name) || + !scop_info_.analysis_result_.GetMatrixMatmulMajor().count(id_name)) { + return node; } - if (node.isa()) { - auto mark_node = node.as(); - if (mark_node.get_id().get_name() == THREAD_MARKER && - mark_node.child(0).child(0).isa()) { - hoist_register_node = mark_node.child(0).child(0); - return false; - } + bool is_conv = scop_info_.user_config_.GetEnableConvTensorCore(); + if (is_conv) { + band_node = AdjustConvScheduleTreeStructure(band_node); } - return true; - }); - return hoist_register_node; -} - -isl::schedule RegisterMemoryManager::HoistRegisterMemoryOnMark(isl::schedule_node root) { - std::string config_shared_tensors = scop_info_.user_config_.GetSharedTensors(); - auto c_mark = PROMOTE_GLOBAL_TO_REGISTER_C; - if (config_shared_tensors.find(local_tensor_c_) != std::string::npos) { - c_mark = PROMOTE_SHARED_TO_REGISTER_C; - } - auto mark_node = CollectMarkNode(root, c_mark).at(0); - auto tmp_hoist_node = mark_node.parent(); - - while (!tmp_hoist_node.isa()) { - tmp_hoist_node = tmp_hoist_node.parent(); - } - - auto depth = tmp_hoist_node.child(0).schedule_depth(); - auto hoist_compute_node = tmp_hoist_node.as(); - for (size_t i = 0; i < 
hoist_compute_node.n_member(); ++i) { - if (!hoist_compute_node.member_get_coincident(i)) { - if (scop_info_.user_config_.GetEnableTensorCoreUsePoly() && i == 0) { - hoist_tensor_all_ = true; - auto hoist_node = mark_node.del().parent(); - auto sch = HoistRegisterMemoryOnDepth(hoist_node, depth); - return sch; - } - hoist_compute_node = hoist_compute_node.split(i); - depth = depth - hoist_compute_node.n_member() + i; + auto mapping_cfg = scop_info_.user_config_.GetReplaceConfig()[WARP_COMPUTE]; + CHECK(mapping_cfg != nullptr) << "mapping config is null"; + // split member that does not involved in thread mapping + auto mem_size = band_node.as().n_member(); + if (mem_size > mapping_cfg->bound) { + band_node = band_node.as().split(mem_size - mapping_cfg->bound); + band_node = band_node.child(0); } - } - auto sch = HoistRegisterMemoryOnDepth(hoist_compute_node, depth); - auto hoist_ab_root = sch.get_root(); - auto ab_mark = PROMOTE_SHARED_TO_REGISTER_AB; - auto mark_ab_node = CollectMarkNode(hoist_ab_root, ab_mark).at(0); - auto hoist_ab_node = mark_ab_node.del().parent(); - auto hoist_ab_depth = hoist_ab_node.schedule_depth(); - hoist_compute_local_tensor_ = false; - sch = HoistRegisterMemoryOnDepth(hoist_ab_node, hoist_ab_depth); + std::string matrix_name = scop_info_.analysis_result_.GetMatrixMatmulMap()[id_name]; + std::string matrix_major = scop_info_.analysis_result_.GetMatrixMatmulMajor()[id_name]; + isl::multi_val tile_size_val = GetRealTileSizeVal(band_node, matrix_name, matrix_major); + band_node = TileBand(band_node, tile_size_val); - return sch; -} + node = band_node.ancestor(band_node.get_tree_depth() - start_depth); + return node; + }; -std::string RegisterMemoryManager::GetPromotedWriteName() { - std::string write_name = GML_WRITE_ID_NAME; - std::string shared_tensors = shared_tensors_; - if (shared_tensors.find(local_tensor_c_) != std::string::npos) { - write_name = SHARED_WRITE_ID_NAME; - } - return write_name; + return 
orig_node.map_descendant_bottom_up(CollectReadWriteFilter); } isl::schedule_node RegisterMemoryManager::AdjustConvScheduleTreeStructure(const isl::schedule_node &orig_node) { @@ -632,54 +332,6 @@ isl::schedule_node RegisterMemoryManager::AdjustConvScheduleTreeStructure(const return band_node; } -// According to the value of the conv interface, the size of the tensor is split to confirm the size of the fragment. -isl::schedule_node RegisterMemoryManager::TileTensorAccordingInterfaceValue(isl::schedule_node &root) { - CHECK(scop_info_.user_config_.GetReplaceConfig().count(WARP_COMPUTE)) << "Cannot map to warp."; - std::string write_name = GetPromotedWriteName(); - auto CollectReadWriteFilter = [this, write_name](isl::schedule_node node) -> isl::schedule_node { - if (!node.isa()) { - return node; - } - bool is_all_sets_read_or_write = IsReadOrWriteTensor(node, SHARED_READ_ID_NAME, write_name); - if (!is_all_sets_read_or_write) { - return node; - } - - auto start_depth = node.get_tree_depth(); - - auto band_node = GetCanMappingNode(node); - std::string id_name = GetPromotionTensorName(band_node, scop_info_.analysis_result_.buffer_def_infos_); - if (id_name.empty() || !scop_info_.analysis_result_.GetMatrixMatmulMap().count(id_name) || - !scop_info_.analysis_result_.GetMatrixMatmulMajor().count(id_name)) { - return node; - } - - bool is_conv = scop_info_.user_config_.GetEnableConvTensorCore(); - if (is_conv) { - band_node = AdjustConvScheduleTreeStructure(band_node); - } - - auto mapping_cfg = scop_info_.user_config_.GetReplaceConfig()[WARP_COMPUTE]; - CHECK(mapping_cfg != nullptr) << "mapping config is null"; - // split member that does not involved in thread mapping - auto mem_size = band_node.as().n_member(); - if (mem_size > mapping_cfg->bound) { - band_node = band_node.as().split(mem_size - mapping_cfg->bound); - band_node = band_node.child(0); - } - - std::string matrix_name = scop_info_.analysis_result_.GetMatrixMatmulMap()[id_name]; - std::string matrix_major = 
scop_info_.analysis_result_.GetMatrixMatmulMajor()[id_name]; - isl::multi_val tile_size_val = GetRealTileSizeVal(band_node, matrix_name, matrix_major); - band_node = TileBand(band_node, tile_size_val); - - node = band_node.ancestor(band_node.get_tree_depth() - start_depth); - return node; - }; - - return root.map_descendant_bottom_up(CollectReadWriteFilter); -} - isl::multi_val RegisterMemoryManager::GetRealTileSizeVal(const isl::schedule_node &node, const std::string &matrix_name, const std::string &matrix_major) { auto ctx = node.ctx(); @@ -713,129 +365,6 @@ isl::multi_val RegisterMemoryManager::GetRealTileSizeVal(const isl::schedule_nod return tile_size_val; } -isl::schedule RegisterMemoryManager::RunMatmul(isl::schedule_node root) { - GetActualPromotedSharedTensors(); - auto sch = HoistRegisterMemoryOnMark(root); - if (scop_info_.user_config_.GetEnableTensorCoreUsePoly()) { - root = sch.get_root(); - sch = TileTensorAccordingInterfaceValue(root).get_schedule(); - } - std::string write_name = GetPromotedWriteName(); - std::string marker_name = PROMOTE_REGISTER_TO_GLOBAL; - if (write_name == SHARED_WRITE_ID_NAME) { - marker_name = PROMOTE_REGISTER_TO_SHARED; - } - sch = InsertMarkerForThreadGroup(sch, write_name, marker_name); - return sch; -} - -isl::schedule RegisterMemoryManager::RunReduce(isl::schedule_node root) { - auto sch = root.get_schedule(); - auto res_node = GetRegisterPromotedNode(root); - if (res_node.isa()) { - auto depth = UpdateDepth(res_node); - sch = HoistRegisterMemory(root, depth); - } - return sch; -} - -isl::schedule RegisterMemoryManager::RunElementWise(isl::schedule_node root) { - auto sch = root.get_schedule(); - auto CollectGMLReadWriterFilter = [this](isl::schedule_node node) -> isl::schedule_node { - if (!node.isa()) { - return node; - } - - bool is_all_sets_read_or_write = IsReadOrWriteTensor(node, GML_READ_ID_NAME, GML_WRITE_ID_NAME); - if (!is_all_sets_read_or_write) { - return node; - } - - auto filter = node.as().filter(); - auto 
filter_set = filter.unwrap(); - bool is_vectorization_tensor = false; - filter_set.range().foreach_set([this, &is_vectorization_tensor](const isl::set &s) -> void { - std::string promoted_tensor = s.get_tuple_name(); - for (auto buffer : scop_info_.analysis_result_.active_buffer_footprints_) { - auto cluster_id = buffer.second.cluster_id; - if (cluster_id.get_name() == promoted_tensor) { - auto cluster = buffer.second.cluster; - auto box_sizes = cluster->GetFixedBoxSizes(); - auto local_size = 1; - for (auto i : box_sizes) { - local_size = local_size * i; - } - if (local_size == 4 || local_size == 8) { - // vectorization mode fp32 or fp16 - is_vectorization_tensor = true; - } - } - } - }); - - if (!is_vectorization_tensor) { - return node; - } - - if (node.n_children() > 0 && node.child(0).isa()) { - node = node.child(0).insert_mark(PROMOTE_VECTORIZATION); - node = node.parent(); - } - return node; - }; - - isl::schedule_node res_node = root; - if (scop_info_.user_config_.GetEnableVectorization()) { - res_node = GetVectorizationPromotedNode(root); - if (res_node.isa()) { - return sch; - } - } else { - res_node = GetRegisterPromotedNode(root); - } - - if (res_node.isa()) { - auto depth = UpdateDepth(res_node); - - sch = HoistRegisterMemory(root, depth); - - if (scop_info_.user_config_.GetEnableVectorization()) { - auto tmp_root = sch.get_root(); - tmp_root = tmp_root.map_descendant_bottom_up(CollectGMLReadWriterFilter); - sch = tmp_root.get_schedule(); - } - } - return sch; -} - -isl::schedule RegisterMemoryManager::Run(isl::schedule sch) { - if (!scop_info_.user_config_.GetLocalTensors().empty()) { - configed_tensors_ = Split(scop_info_.user_config_.GetLocalTensors(), " "); - } - if (scop_info_.user_config_.GetEnableMatmul()) { - local_tensor_c_ = GetMatmulTensorsName(scop_info_)[MATRIX_C]; - } - - sch = InsertContextNode(sch, scop_info_); - - if (!scop_info_.user_config_.GetUseRegisterMemory()) { - return sch; - } - - schedule_ = sch; - auto root = sch.get_root(); 
- - if (scop_info_.user_config_.GetEnableMatmul()) { - sch = RunMatmul(root); - } else if (scop_info_.analysis_result_.GetUseGpuReduceLib()) { - sch = RunReduce(root); - } else { - sch = RunElementWise(root); - } - - return sch; -} - } // namespace poly } // namespace ir } // namespace akg diff --git a/src/poly/schedule_pass_gpu/register_memory_manager.h b/src/poly/schedule_pass_gpu/register_memory_manager.h index 9dee0478fef06b240899c0c46be67c851b56a079..62b6b2889d99853b1a61e2eaafc5540ee8c12cf1 100644 --- a/src/poly/schedule_pass_gpu/register_memory_manager.h +++ b/src/poly/schedule_pass_gpu/register_memory_manager.h @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,62 +36,40 @@ class RegisterMemoryManager : public SchedulePass { explicit RegisterMemoryManager(PassInfo &pass_info, ScopInfo &scop_info) : pass_info_(pass_info), scop_info_(scop_info) { pass_name_ = __FUNCTION__; - remain_memory_ = MAX_REGISTER_PER_THREAD_BLOCK * REGISTER_ALLOC_RATIO; }; ~RegisterMemoryManager() {} virtual isl::schedule Run(isl::schedule sch); - isl::schedule HoistRegisterMemoryOnDepth(isl::schedule_node &node, size_t depth); + private: + isl::schedule HoistRegisterMemory(); + isl::schedule_node HoistRegisterMemoryOnMark(const isl::schedule_node &orig_node); + isl::union_map GetPartialSchedule(const isl::schedule_node &node); + isl::schedule_node HoistClusters(const isl::schedule_node &node); - void CreateTensorCluster(const isl::schedule_node &node, const isl::union_map &outer_sch); + void CreateClusterForOperator(const isl::schedule_node &node); void GatherBufferFootprintDefInfo(const isl::schedule_node &node, BufferDefInfo &tensor_info); - bool IsPromote(const TensorFootprintCluster &fp_cluster, const isl::multi_union_pw_aff &partial_sched_mupa, - const 
isl::multi_union_pw_aff &thread_schedule); - bool UnrolledLoop(const TensorFootprintCluster &fp_cluster); - isl::schedule HoistRegisterMemory(isl::schedule_node root, size_t depth); - - size_t UpdateDepth(const isl::schedule_node &root); - isl::schedule_node GetRegisterPromotedNode(isl::schedule_node &root); - isl::schedule HoistRegisterMemoryOnMark(isl::schedule_node root); isl::schedule_node AdjustConvScheduleTreeStructure(const isl::schedule_node &orig_node); - isl::schedule_node TileTensorAccordingInterfaceValue(isl::schedule_node &root); + isl::schedule_node TileTensorAccordingInterfaceValue(const isl::schedule_node &orig_node); isl::multi_val GetRealTileSizeVal(const isl::schedule_node &node, const std::string &matrix_name, const std::string &matrix_major); - std::string GetPromotedWriteName(); - - void GetActualPromotedSharedTensors(); - - bool IsReadOrWriteBand(isl::schedule_node node); - - isl::schedule_node GetVectorizationPromotedNode(isl::schedule_node &root); + void SetPromotedWriteNameForGemm(std::string &local_tensor_c); + isl::schedule_node InsertMarkerForEmit(const isl::schedule_node &orig_node); - isl::schedule_node PromotedNodeUnderSequence(isl::schedule_node_sequence &node); - - isl::schedule RunMatmul(isl::schedule_node root); - - isl::schedule RunReduce(isl::schedule_node root); - - isl::schedule RunElementWise(isl::schedule_node root); - - bool CheckRAW(std::string &name); - - private: PassInfo &pass_info_; ScopInfo &scop_info_; isl::schedule schedule_; - std::vector configed_tensors_; - bool hoist_compute_local_tensor_{true}; - bool hoist_tensor_all_{false}; - std::string local_tensor_c_; - std::string shared_tensors_; - size_t remain_memory_{0}; + std::string write_name_; + + int band_index_{0}; + OuterBandNode *current_outer_bn_{nullptr}; + std::unordered_set mark_names_; }; } // namespace poly diff --git a/src/poly/schedule_pass_gpu/shared_memory_manager.cc b/src/poly/schedule_pass_gpu/shared_memory_manager.cc index 
82f438d3164df1817fce43ed626a08c2d5275ed4..2fd4df661cbfb55705b8982ac480e5653d99b399 100644 --- a/src/poly/schedule_pass_gpu/shared_memory_manager.cc +++ b/src/poly/schedule_pass_gpu/shared_memory_manager.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "poly/schedule_pass_gpu/operator_shared_strategy.h" +#include "poly/create_cluster.h" #include "poly/schedule_pass_gpu/operator_mapping_strategy.h" #include "shared_memory_manager.h" #include "poly/schedule_tree_util.h" @@ -28,41 +28,34 @@ namespace ir { namespace poly { isl::schedule SharedMemoryManager::Run(isl::schedule sch) { - if (!scop_info_.user_config_.GetSharedTensors().empty()) { - configed_tensors_ = Split(scop_info_.user_config_.GetSharedTensors(), " "); - } if (!scop_info_.user_config_.GetUseSharedMemory()) { return sch; } schedule_ = sch; PrepareInfoForPromotion(); schedule_ = HoistSharedMemory(); - if (scop_info_.user_config_.GetEnableMatmul()) { - schedule_ = InsertMarkerForThreadGroup(schedule_, WRITE_ID_NAME, PROMOTE_SHARED_TO_GLOBAL); - } schedule_ = InsertContextNode(schedule_, scop_info_); return schedule_; } void SharedMemoryManager::PrepareInfoForPromotion() { // Update the variable/tensor to share - if (!scop_info_.user_config_.GetSharedTensors().empty()) { - configed_tensors_ = Split(scop_info_.user_config_.GetSharedTensors(), " "); - } - + configed_tensors_ = scop_info_.user_config_.GetSharedTensors(); bank_conflict_ = scop_info_.user_config_.GetEnableBankConflict(); shared_inversed_thread_map_ = scop_info_.user_config_.GetSharedInversedThreadMap(); shared_vector_align_ = scop_info_.user_config_.GetSharedVectorAlign(); - if 
(scop_info_.user_config_.GetVectorLength() && !scop_info_.user_config_.GetEnableVectorization() && - !scop_info_.user_config_.EnableStitchFusion()) { - scop_info_.user_config_.SetEnableOneDimThread(true); - } unroll_shared_ = scop_info_.user_config_.GetUnrollShared(); } -isl::schedule SharedMemoryManager::HoistSharedMemory() { +isl::schedule_node SharedMemoryManager::HoistSharedMemoryOnMark(const isl::schedule_node &orig_node) { + current_outer_bn_ = scop_info_.analysis_result_.GetOuterBandNode(band_index_); + if (!current_outer_bn_->use_shared_memory) { + return orig_node; + } + CreateClusterForOperator(orig_node); + std::string mark_name = ""; - auto HoistSharedMemoryOnMark = [this, &mark_name](isl::schedule_node node) -> isl::schedule_node { + auto GetMarkNode = [this, &mark_name](isl::schedule_node node) -> isl::schedule_node { if (!node.isa()) { return node; } @@ -79,25 +72,22 @@ isl::schedule SharedMemoryManager::HoistSharedMemory() { return HoistClusters(node.parent()).child(0); }; - auto HoistCoreFunc = [this, HoistSharedMemoryOnMark, - &mark_name](const isl::schedule_node &orig_node) -> isl::schedule_node { - current_outer_bn_ = scop_info_.analysis_result_.GetOuterBandNode(band_index_); - if (!current_outer_bn_->use_shared_memory) { - return orig_node; - } - CreateClusterForOperator(orig_node); - auto node = orig_node; - for (auto name : mark_names_) { - mark_name = name; - node = MapDescendantTopDown(node, HoistSharedMemoryOnMark); - } - node = MapCopiesToThreads(node, unroll_shared_); - return node; - }; + auto node = orig_node; + for (auto name : mark_names_) { + mark_name = name; + node = MapDescendantTopDown(node, GetMarkNode); + } + node = MapCopiesToThreads(node, unroll_shared_); + node = InsertMarkerForRegisterPromotion(node); + node = DeleUselessMarker(node, mark_names_); + return node; +} + +isl::schedule SharedMemoryManager::HoistSharedMemory() { isl::schedule_node node = GetOuterBand(schedule_.root()); if (node.isa()) { - node = 
HoistCoreFunc(node); + node = HoistSharedMemoryOnMark(node); } else { int number = static_cast(node.n_children()); for (int i = 0, current_band_index = 0; i < number; ++i) { @@ -107,7 +97,8 @@ isl::schedule SharedMemoryManager::HoistSharedMemory() { remain_memory_ = akg::common::SHARED_MEMORY_SIZE; mark_names_.clear(); band_index_ = current_band_index; - node = HoistCoreFunc(promotion_node).ancestor(2); + node = HoistSharedMemoryOnMark(promotion_node); + node = node.parent().parent(); ++current_band_index; } } @@ -116,15 +107,13 @@ isl::schedule SharedMemoryManager::HoistSharedMemory() { } void SharedMemoryManager::CreateClusterForOperator(const isl::schedule_node &node) { + SharedCreateCluster create_cluster(scop_info_, band_index_); if (scop_info_.analysis_result_.GetUseGpuReduceLib()) { // reduce operator - is_reduce_ = true; mark_names_.emplace(PROMOTE_GLOBAL_TO_SHARED); - ReduceSharedStrategy reduce_op(scop_info_, mark_names_, band_index_); - reduce_op.CreateClusterList(node); + create_cluster.CreateClusterListForReduce(node, mark_names_); } else if (scop_info_.user_config_.GetEnableMatmul()) { // matmul operator - is_matmul_ = true; remain_memory_ = akg::common::ADVANCED_SHARED_MEMORY_SIZE; auto tensor_c_name = GetMatmulTensorsName(scop_info_)[MATRIX_C]; @@ -133,13 +122,49 @@ void SharedMemoryManager::CreateClusterForOperator(const isl::schedule_node &nod } mark_names_.emplace(PROMOTE_GLOBAL_TO_SHARED_AB); - BatchMatmulSharedStrategy matmul_op(scop_info_, mark_names_, band_index_); - matmul_op.CreateClusterList(node); + create_cluster.CreateClusterListForGemm(node, mark_names_); } else { mark_names_.emplace(PROMOTE_GLOBAL_TO_SHARED); - OperatorSharedStrategy other_op(scop_info_, mark_names_, band_index_); - other_op.CreateClusterList(node); + create_cluster.CreateClusterListForElementWise(node, mark_names_); + } +} + +isl::schedule_node SharedMemoryManager::InsertMarkerForRegisterPromotion(const isl::schedule_node &orig_node) { + isl::schedule_node 
hoist_register_node = orig_node; + + if (scop_info_.user_config_.GetEnableMatmul()) { + if (mark_names_.find(PROMOTE_GLOBAL_TO_SHARED_C) != mark_names_.end()) { + hoist_register_node = orig_node.child(0).insert_mark(PROMOTE_SHARED_TO_REGISTER_C); + } + hoist_register_node = InsertMarkerForThreadGroup(hoist_register_node, WRITE_ID_NAME, PROMOTE_SHARED_TO_GLOBAL); + return ReplaceMarker(hoist_register_node, PROMOTE_GLOBAL_TO_SHARED_AB, SHARED_MEM_PROMOTED_COMPLETE); + } else if (current_outer_bn_->enable_vectorization) { + return orig_node; } + + size_t start_depth = orig_node.get_tree_depth(); + + orig_node.foreach_descendant_top_down([&hoist_register_node, this](const isl::schedule_node &node) -> bool { + if (node.isa()) { + auto sequence_node = node.as(); + if (sequence_node.parent().isa()) { + hoist_register_node = sequence_node.parent().insert_mark(PROMOTE_GLOBAL_TO_REGISTER); + return false; + } + } + + if (node.isa()) { + auto mark_node = node.as(); + if (mark_node.get_id().get_name() == THREAD_MARKER) { + hoist_register_node = mark_node.insert_mark(PROMOTE_GLOBAL_TO_REGISTER); + return false; + } + } + return true; + }); + + hoist_register_node = hoist_register_node.ancestor(hoist_register_node.get_tree_depth() - start_depth); + return hoist_register_node; } isl::schedule_node SharedMemoryManager::MapCopiesToThreads(const isl::schedule_node &orig_node, bool unroll) { @@ -154,11 +179,14 @@ isl::schedule_node SharedMemoryManager::MapCopiesToThreads(const isl::schedule_n } auto band_node = GetCanMappingNode(node); - auto thread_cfg = scop_info_.user_config_.GetThreadConfig(); - auto mapping_cfg = thread_cfg; + if (!scop_info_.user_config_.EnableStitchFusion() && !current_outer_bn_->enable_vectorization && + scop_info_.user_config_.GetVectorLength()) { + scop_info_.user_config_.SetEnableOneDimThread(true); + } + if (scop_info_.user_config_.GetEnableOneDimThread()) { mapping_cfg = GetCurrentConfig(band_node); @@ -216,7 +244,7 @@ isl::schedule_node 
SharedMemoryManager::MapCopiesToThreads(const isl::schedule_n others_op.SetRequiredMappingCfg(band_node); // Map band under thread_root from inner dim to outer dim. band_node = others_op.MapDimToThreadsBlocks(band_node); - if (is_reduce_) { + if (scop_info_.analysis_result_.GetUseGpuReduceLib()) { std::string atomic_type = InAtomicTensors(node); auto InsertAtomicMarker = [atomic_type, this](isl::schedule_node atomic_node) -> isl::schedule_node { if (atomic_type != "" && atomic_node.has_children() && atomic_node.child(0).isa()) { @@ -325,7 +353,7 @@ void SharedMemoryManager::GatherBufferFootprintDefInfo(const isl::schedule_node isl::id tensor_id = tensor_info.tensor_id; Type type = scop_info_.GetDtypeOf(tensor_id); - if (is_matmul_ && tensor_id.get_name() == GetMatmulTensorsName(scop_info_)[MATRIX_C]) { + if (scop_info_.user_config_.GetEnableMatmul() && tensor_id.get_name() == GetMatmulTensorsName(scop_info_)[MATRIX_C]) { sizes.back() += 8; } @@ -356,50 +384,9 @@ void SharedMemoryManager::GatherBufferFootprintDefInfo(const isl::schedule_node } isl::schedule_node SharedMemoryManager::HoistClusters(const isl::schedule_node &node) { - auto partial_sched_mupa = ShortScheduleMupa(schedule_.root(), node); - - std::vector buffer_def_infos_origin; - std::vector buffer_def_infos_temp; - auto origin_binds = scop_info_.user_config_.GetOriginBind(); - std::unordered_set tensor_name; - - for (auto i : origin_binds) { - if (!i.first.defined()) continue; - tensor_name.insert(i.first->op->name); - } - + auto res_node = node; for (size_t index = 0; index < scop_info_.analysis_result_.buffer_def_infos_.size(); index++) { BufferDefInfo &buffer_info = scop_info_.analysis_result_.buffer_def_infos_[index]; - if (tensor_name.count(buffer_info.tensor_id.get_name())) { - buffer_def_infos_origin.push_back(buffer_info); - } else { - buffer_def_infos_temp.push_back(buffer_info); - } - } - - auto res_node = node; - if (scop_info_.analysis_result_.GetTensorOfTensor()) { - 
SharedPromotion(buffer_def_infos_temp, res_node, node, partial_sched_mupa); - SharedPromotion(buffer_def_infos_origin, res_node, node, partial_sched_mupa); - - scop_info_.analysis_result_.buffer_def_infos_.clear(); - for (auto &b : buffer_def_infos_temp) { - scop_info_.analysis_result_.buffer_def_infos_.push_back(b); - } - for (auto &b : buffer_def_infos_origin) { - scop_info_.analysis_result_.buffer_def_infos_.push_back(b); - } - } else { - SharedPromotion(scop_info_.analysis_result_.buffer_def_infos_, res_node, node, partial_sched_mupa); - } - return res_node; -} - -void SharedMemoryManager::SharedPromotion(std::vector &bd, isl::schedule_node &res_node, - const isl::schedule_node &node, - const isl::multi_union_pw_aff &partial_sched_mupa) { - for (size_t index = 0; index < bd.size(); index++) { - BufferDefInfo &buffer_info = bd[index]; auto fp_cluster = buffer_info.GetFootPrintClusterGPU(node); if ((fp_cluster == nullptr || !fp_cluster->foot_print_.box.is_valid())) { continue; @@ -422,32 +409,24 @@ void SharedMemoryManager::SharedPromotion(std::vector &bd, isl::s auto approximation_size = std::accumulate(box_sizes.begin(), box_sizes.end(), 1, std::multiplies()); size_t byte = Bytes(id); size_t memory_requirement = approximation_size * byte; - bool use_reuse_filter = true; - if (InAtomicTensors(buffer_info.tensor_id.name()) || InReduceTensors(buffer_info.tensor_id.name()) || is_matmul_ || - current_outer_bn_->template_type == Template::TRANSPOSE_OP) { - use_reuse_filter = false; - } - bool is_injective = !ReuseTensorCluster(*fp_cluster, partial_sched_mupa); - if (memory_requirement < remain_memory_) { - bool need_shared_memory = !use_reuse_filter || !is_injective || CoalescingAccessWay(res_node, *fp_cluster); - if (!need_shared_memory) { - continue; - } - GatherBufferFootprintDefInfo(res_node, buffer_info); - auto dst_id = buffer_info.dst_tensor_id; - res_node = HoistToBlockThreadMemory(res_node, GpuMemType::SHARED, id, dst_id, *(fp_cluster), true); - 
remain_memory_ -= memory_requirement; - - // collect active_buffer_footprints_ info for codegen - auto out_schedule = LocalSchedule(res_node); - auto active_domains = CollectDomain(res_node); - scop_info_.analysis_result_.active_buffer_footprints_.emplace_back(std::make_pair( - active_domains, - BufferedFootPrintInfo{std::shared_ptr(std::move(fp_cluster)), out_schedule, dst_id})); - buffer_info.find_buffer = true; + if (memory_requirement >= remain_memory_) { + continue; } + GatherBufferFootprintDefInfo(res_node, buffer_info); + auto dst_id = buffer_info.dst_tensor_id; + res_node = HoistToBlockThreadMemory(res_node, GpuMemType::SHARED, id, dst_id, *(fp_cluster), true); + remain_memory_ -= memory_requirement; + + // collect active_buffer_footprints_ info for codegen + auto out_schedule = LocalSchedule(res_node); + auto active_domains = CollectDomain(res_node); + scop_info_.analysis_result_.active_buffer_footprints_.emplace_back(std::make_pair( + active_domains, + BufferedFootPrintInfo{std::shared_ptr(std::move(fp_cluster)), out_schedule, dst_id})); + buffer_info.find_buffer = true; } + return res_node; } isl::schedule_node SharedMemoryManager::HoistToBlockThreadMemory(isl::schedule_node &tree, GpuMemType type, @@ -465,42 +444,6 @@ isl::schedule_node SharedMemoryManager::HoistToBlockThreadMemory(isl::schedule_n return res_node; } -bool SharedMemoryManager::CoalescingAccessWay(const isl::schedule_node &node, const TensorFootprintCluster &cluster) { - isl::union_map original = cluster.OrigianlAccessRelations(); - size_t tensor_dim = cluster.foot_print_.GetBoxDim(); - std::vector thread_marker = CollectFnNode(IsThreadMappedMark, schedule_.root()); - for (auto item : thread_marker) { - if (!(item.isa()) && !(item.has_children()) && - !(item.child(0).isa())) { - continue; - } - isl::schedule_node thread_filter = item.child(0); - if (!thread_filter.has_children()) { - continue; - } - isl::schedule_node thread_band = thread_filter.child(0); - if 
(!thread_band.has_children()) { - continue; - } - isl::schedule_node inner_band = thread_band.child(0); - size_t num_mapped_thread = inner_band.schedule_depth() - thread_band.schedule_depth(); - if (num_mapped_thread == 0) { - continue; - } - size_t inner_depth = inner_band.schedule_depth(); - auto active_domains = CollectDomain(thread_band); - auto local_access = original.intersect_domain(active_domains); - auto schedule = ShortSchedule(inner_band); - auto schedule_access = local_access.apply_domain(schedule); - for (auto access : schedule_access.get_map_list()) { - if (!IsSubsetForIncreaseDim(access, tensor_dim - 1, inner_depth - 1)) { - return true; - } - } - } - return false; -} - std::string SharedMemoryManager::InAtomicTensors(isl::schedule_node &node) { if (!node.isa()) { return ""; diff --git a/src/poly/schedule_pass_gpu/shared_memory_manager.h b/src/poly/schedule_pass_gpu/shared_memory_manager.h index dbacf2e1fd0695a7acb85a04a61b1a8290b38630..81d0cfbe074e276bb7b945ecb91cad3291961f6d 100644 --- a/src/poly/schedule_pass_gpu/shared_memory_manager.h +++ b/src/poly/schedule_pass_gpu/shared_memory_manager.h @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,6 +36,7 @@ class SharedMemoryManager : public SchedulePass { virtual isl::schedule Run(isl::schedule sch); + private: void PrepareInfoForPromotion(); // create cluster @@ -43,16 +44,15 @@ class SharedMemoryManager : public SchedulePass { // promotion core function isl::schedule HoistSharedMemory(); + isl::schedule_node HoistSharedMemoryOnMark(const isl::schedule_node &orig_node); void GatherBufferFootprintDefInfo(const isl::schedule_node &node, BufferDefInfo &tensor_info); isl::schedule_node HoistClusters(const isl::schedule_node &node); - void SharedPromotion(std::vector &bd, isl::schedule_node &res_node, const isl::schedule_node &node, - const isl::multi_union_pw_aff &partial_sched_mupa); isl::schedule_node HoistToBlockThreadMemory(isl::schedule_node &tree, GpuMemType type, const isl::id &tensor_id, const isl::id &dst_tensor_id, TensorFootprintCluster &cluster, bool force_last_extension_odd); size_t Bytes(const isl::id tensor_id); - bool CoalescingAccessWay(const isl::schedule_node &node, const TensorFootprintCluster &cluster); + isl::schedule_node InsertMarkerForRegisterPromotion(const isl::schedule_node &orig_node); // Other optimization void OptimizeSharedDimension(std::vector &sizes, Type type); @@ -69,10 +69,9 @@ class SharedMemoryManager : public SchedulePass { bool InReduceTensors(const std::string &name); std::string AtomicMarker(const std::string &type); - private: ScopInfo &scop_info_; isl::schedule schedule_; - std::vector configed_tensors_; + std::unordered_set configed_tensors_; bool bank_conflict_{false}; bool shared_inversed_thread_map_{false}; int shared_vector_align_{0}; diff --git a/src/poly/schedule_tree_util.cc b/src/poly/schedule_tree_util.cc index c70ffac038e5674d3c7169f58793db226e02e210..8ac754ec760dacee561a492ed821d7c74235110d 100644 --- a/src/poly/schedule_tree_util.cc +++ b/src/poly/schedule_tree_util.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * 
* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -222,27 +222,26 @@ std::vector BandsSplitAfterDepth(const std::vector isl::schedule_node { +isl::schedule_node InsertMarkerForThreadGroup(const isl::schedule_node &orig_node, const std::string &filter_name, + const std::string &marker_name) { + auto GetPromotedWriteFilter = [filter_name, marker_name](isl::schedule_node node) -> isl::schedule_node { if (!node.isa()) { return node; } isl::union_set uset = node.as().get_filter(); - bool is_gm_write = false; - uset.foreach_set([&is_gm_write, write_name](isl::set s) { - if (s.get_tuple_name() == write_name) { - is_gm_write = true; + bool is_gm_filter = false; + uset.foreach_set([&is_gm_filter, filter_name](isl::set s) { + if (s.get_tuple_name() == filter_name) { + is_gm_filter = true; } }); - if (is_gm_write && node.has_parent() && node.parent().isa()) { + if (is_gm_filter && node.has_parent() && node.parent().isa()) { node = node.child(0).insert_mark(marker_name); node = node.parent(); } return node; }; - auto final_sch = sch.get_root().map_descendant_bottom_up(GetPromotedWriteFilter).schedule(); - return final_sch; + return orig_node.map_descendant_bottom_up(GetPromotedWriteFilter); } std::string GetMarkerName(const isl::schedule_node &node, std::string find_name) { @@ -961,6 +960,43 @@ isl::schedule_node GetMarkerNode(const isl::schedule_node &orig_node, const std: return node; } +isl::schedule_node DeleUselessMarker(const isl::schedule_node &orig_node, + const std::unordered_set &mark_names) { + auto DeleteMarker = [mark_names](isl::schedule_node node) -> isl::schedule_node { + if (!node.isa()) { + return node; + } + + auto marker_node = node.as(); + std::string marker_str = marker_node.get_id().get_name(); + if (mark_names.find(marker_str) != mark_names.end()) { + return node.del(); + } + + return node; + }; + return orig_node.map_descendant_bottom_up(DeleteMarker); +} + 
+isl::schedule_node ReplaceMarker(const isl::schedule_node &orig_node, const std::string &orig_name, + const std::string &replaced_name) { + auto DeleteMarker = [orig_name, replaced_name](isl::schedule_node node) -> isl::schedule_node { + if (!node.isa()) { + return node; + } + + auto marker_node = node.as(); + std::string marker_str = marker_node.get_id().get_name(); + if (marker_str == orig_name) { + node = node.del(); + return node.insert_mark(replaced_name); + } + + return node; + }; + return orig_node.map_descendant_bottom_up(DeleteMarker); +} + } // namespace poly } // namespace ir } // namespace akg diff --git a/src/poly/schedule_tree_util.h b/src/poly/schedule_tree_util.h index 729cec13e56eacbb630d1445f571b88ed67bbc75..f0b9b2302d0ae0d40448fb5e1ea2cb88093500bb 100644 --- a/src/poly/schedule_tree_util.h +++ b/src/poly/schedule_tree_util.h @@ -1,5 +1,5 @@ /** - * Copyright 2020-2021 Huawei Technologies Co., Ltd + * Copyright 2020-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -103,7 +103,7 @@ isl::multi_union_pw_aff MapDomainAllWithType(const isl::schedule_node &node, Map isl::map CreateMapIncreaseDim(isl::space space, unsigned dim); bool IsSubsetForIncreaseDim(const isl::map access, size_t tensor_dim, size_t node_dim); int GetLastAxis(const isl::schedule_node node, isl::union_map original_access, - std::unordered_set skip_tensors); + std::unordered_set skip_tensors = {}); std::vector CollectFnNode(const std::function &fn, const isl::schedule_node &root); @@ -114,8 +114,8 @@ isl::schedule_node UnrollByMarkOptions(isl::schedule_node &node, uint64_t unroll isl::map GetExtensionSpace(const isl::schedule_node &node, const isl::id &id); isl::schedule_node InsertExtensionNodeBeforeOrAfter(const isl::schedule_node &node, const isl::id &id, bool before); -isl::schedule InsertMarkerForThreadGroup(const isl::schedule &sch, const std::string &write_name, - const std::string &marker_name); +isl::schedule_node InsertMarkerForThreadGroup(const isl::schedule_node &orig_node, const std::string &filter_name, + const std::string &marker_name); std::string GetMarkerName(const isl::schedule_node &node, std::string find_name); isl::union_set GetMappingFilterInfo(const isl::schedule_node node, MappingCfg *mapping_cfg, @@ -142,6 +142,11 @@ isl::schedule_node CheckMapSizeAndApplyTile(const isl::schedule_node &mapping_ro const std::vector &additional_tile_size = {}); isl::multi_union_pw_aff GetMappingPartialSchedule(const isl::schedule_node_band &node, const bool is_promotion = false); isl::schedule_node GetMarkerNode(const isl::schedule_node &orig_node, const std::string &marker_name); +isl::schedule_node DeleUselessMarker(const isl::schedule_node &orig_node, + const std::unordered_set &mark_names); + +isl::schedule_node ReplaceMarker(const isl::schedule_node &orig_node, const std::string &orig_name, + const std::string &replaced_name); } // namespace poly } // namespace ir diff --git a/src/poly/scop_info.cc b/src/poly/scop_info.cc index 
05d65ec0c8c42a04ee7dc83be9036387e6a18e81..f98acbcf71b9d1bf3c0ffd5c4747e6b81ec65e56 100644 --- a/src/poly/scop_info.cc +++ b/src/poly/scop_info.cc @@ -1,5 +1,5 @@ /** - * Copyright 2019-2021 Huawei Technologies Co., Ltd + * Copyright 2019-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ namespace akg { namespace ir { namespace poly { constexpr int kInvalidIntAttr = -1; +constexpr int CONV_INPUT_DIM = 5; +constexpr int INT_BITS = 32; Expr kInvalidExprAttr; CubeInfo::~CubeInfo() { @@ -162,12 +164,13 @@ bool ScopInfo::IsElewiseVMStmt(const isl::id &id) const { bool ScopInfo::MayWriteAfterRead(const std::string &name) const { std::map def; std::map use; + const int idstr_size = 2; for (auto a : analysis_result_.GetWrites().get_map_list()) { isl::id id = a.domain().unwrap().domain().get_tuple_id(); std::string idstr = id.get_name(); if (a.get_tuple_id(isl_dim_out).get_name() != name) continue; - CHECK_GE(idstr.size(), 2); - idstr = idstr.substr(2, idstr.size()); + CHECK_GE(idstr.size(), idstr_size); + idstr = idstr.substr(idstr_size, idstr.size()); int ref = static_cast(WrappedStrtol(idstr)); def[ref] = id; } @@ -175,8 +178,8 @@ bool ScopInfo::MayWriteAfterRead(const std::string &name) const { isl::id id = a.domain().unwrap().domain().get_tuple_id(); std::string idstr = id.get_name(); if (a.get_tuple_id(isl_dim_out).get_name() != name) continue; - CHECK_GE(idstr.size(), 2); - idstr = idstr.substr(2, idstr.size()); + CHECK_GE(idstr.size(), idstr_size); + idstr = idstr.substr(idstr_size, idstr.size()); int ref = static_cast(WrappedStrtol(idstr)); use[ref] = id; } @@ -408,7 +411,7 @@ Type ScopInfo::GetDtypeOf(const std::string &tensor_name) const { } } CHECK(false) << " no such tensor in binds: " << tensor_name; - return Int(32); + return Int(INT_BITS); } Type ScopInfo::GetDtypeOf(const isl::ast_expr &e) const { @@ -416,7 +419,7 @@ Type 
ScopInfo::GetDtypeOf(const isl::ast_expr &e) const { isl::id var = op.get_arg(0).as().get_id(); return GetDtypeOf(var); } - return Int(32); + return Int(INT_BITS); } std::vector ScopInfo::GetShapeOf(const std::string &tensor_name) const { @@ -625,8 +628,8 @@ void CubeInfo::SetConvMNKInfo() { auto conv_mnk_dims = GetConvMNKDims(); if (user_config_.GetIsDynamic()) { for (const auto &dim : conv_mnk_dims) { - fractal_int_info_[dim.axis] = IntImm::make(Int(32), dim.c1_tiling_size); - attr_info_.Set(dim.axis, IntImm::make(Int(32), dim.c1_tiling_size)); + fractal_int_info_[dim.axis] = IntImm::make(Int(INT_BITS), dim.c1_tiling_size); + attr_info_.Set(dim.axis, IntImm::make(Int(INT_BITS), dim.c1_tiling_size)); } } else { const int c0_size = 16; @@ -651,7 +654,8 @@ void UserConfig::CollectParams() { if (imm->value == 1) { if (const auto fd = add->a.as()) { if (const auto denominator = fd->b.as()) { - if (denominator->value == 2) { + const int denominator_num = 2; + if (denominator->value == denominator_num) { return CanonicalSimplify(air::ir::Div::make((fd->a + fd->b), fd->b)); } } @@ -743,6 +747,11 @@ void UserConfig::RegisterParam(const Expr &expr) { params_rev_map_.emplace(name, expr); } +std::unordered_set UserConfig::GetSplitTensors(const std::string &tensor_name) { + auto split_tensors = Split(tensor_name, SPACE_PATTERN); + return std::unordered_set(split_tensors.begin(), split_tensors.end()); +} + MappingCfg *UserConfig::GetThreadConfig() { bool enable_replace_cfg = (this->enable_one_dim_thread_ || this->enable_tensor_core_use_poly_); if (!enable_replace_cfg) { @@ -831,7 +840,7 @@ void CubeInfo::UpdateFractalIntFirstInfo(bool is_conv_backprop_filter, void CubeInfo::UpdateFractalIntLastInfo(std::vector filter_fp_cluster_size) { if (IsConvBackpropInput()) { - CHECK_EQ(filter_fp_cluster_size.size(), 4); + CHECK_EQ(filter_fp_cluster_size.size(), CONV_INPUT_DIM - 1); // conv_backprop_input filter: [ko, no, ni, ki] int64_t kh = ExtractIntFromAttrs(ATTR_CONV_KERNEL_H); 
int64_t kw = ExtractIntFromAttrs(ATTR_CONV_KERNEL_W); @@ -840,13 +849,13 @@ void CubeInfo::UpdateFractalIntLastInfo(std::vector filter_fp_cluster_si fractal_int_info_[ATTR_CONV_N_INNER] = (int64_t)filter_fp_cluster_size[2]; } else if (IsConvBackpropFilter()) { - CHECK_EQ(filter_fp_cluster_size.size(), 5); + CHECK_EQ(filter_fp_cluster_size.size(), CONV_INPUT_DIM); // conv_backprop_filter filter: [batch, no, mo, ni, mi] fractal_int_info_[ATTR_CONV_TILE_M] = (int64_t)filter_fp_cluster_size[1]; fractal_int_info_[ATTR_CONV_M_INNER] = (int64_t)filter_fp_cluster_size[3]; fractal_int_info_[ATTR_CONV_GMM_M] = (int64_t)filter_fp_cluster_size[1] * filter_fp_cluster_size[3]; } else { - CHECK_EQ(filter_fp_cluster_size.size(), 4); + CHECK_EQ(filter_fp_cluster_size.size(), CONV_INPUT_DIM - 1); // conv_forward filter: [ko, no, ni, ki] fractal_int_info_[ATTR_CONV_TILE_CO] = (int64_t)filter_fp_cluster_size[1]; fractal_int_info_[ATTR_CONV_TILE_N] = (int64_t)filter_fp_cluster_size[1]; @@ -887,25 +896,26 @@ void CubeInfo::UpdateFractalIntFirstInfoConvBackpropFilter(std::vector i fractal_int_info_[ATTR_CONV_TILE_CO] = (int64_t)fractal_fp_cluster_size[conv_tile_co]; - CHECK_EQ(im2col_fp_cluster_size.size(), 6); + CHECK_EQ(im2col_fp_cluster_size.size(), CONV_INPUT_DIM + 1); fractal_int_info_[ATTR_CONV_GMM_K] = (int64_t)im2col_fp_cluster_size[conv_gmm_k]; } void CubeInfo::UpdateFractalIntFirstInfoConvForward(std::vector im2col_fp_cluster_size, std::vector fractal_fp_cluster_size) { - CHECK_EQ(fractal_fp_cluster_size.size(), 5); + CHECK_EQ(fractal_fp_cluster_size.size(), CONV_INPUT_DIM); fractal_int_info_[ATTR_CONV_BATCH] = (int64_t)fractal_fp_cluster_size[0]; fractal_int_info_[ATTR_CONV_TILE_M] = (int64_t)fractal_fp_cluster_size[1]; fractal_int_info_[ATTR_CONV_TILE_K] = (int64_t)fractal_fp_cluster_size[2]; fractal_int_info_[ATTR_CONV_M_INNER] = (int64_t)fractal_fp_cluster_size[3]; fractal_int_info_[ATTR_CONV_K_INNER] = (int64_t)fractal_fp_cluster_size[4]; - 
CHECK_EQ(im2col_fp_cluster_size.size(), 6); + CHECK_EQ(im2col_fp_cluster_size.size(), CONV_INPUT_DIM + 1); fractal_int_info_[ATTR_CONV_GMM_M] = (int64_t)im2col_fp_cluster_size[1]; } void CubeInfo::UpdateFractalIntInfoConvForward(int isolate_idx) { - auto C0_SIZE = IntImm::make(Int(32), 16); + const int c0_size = 16; + auto C0_SIZE = IntImm::make(Int(INT_BITS), c0_size); fractal_int_info_[ATTR_CONV_TILE_N] = floordiv(model_->get_co_isolate_info(isolate_idx).inner, C0_SIZE); Expr m = model_->get_h_win_isolate_info(isolate_idx).inner * model_->get_w_win_isolate_info(isolate_idx).inner; @@ -922,16 +932,16 @@ void CubeInfo::UpdateFractalIntInfoConvForward(int isolate_idx) { } } else { auto tile_h = ExtractExprFromAttrs(ATTR_CONV_TILE_H); - tile_h = tile_h.get() ? tile_h : IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_TILE_H)); + tile_h = tile_h.get() ? tile_h : IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_TILE_H)); if (!Equal(tile_h, -1)) fractal_int_info_[ATTR_CONV_TILE_H] = tile_h; auto tile_w = ExtractExprFromAttrs(ATTR_CONV_TILE_W); - tile_w = tile_w.get() ? tile_w : IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_TILE_W)); + tile_w = tile_w.get() ? 
tile_w : IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_TILE_W)); if (!Equal(tile_w, -1)) fractal_int_info_[ATTR_CONV_TILE_W] = tile_w; - fractal_int_info_[ATTR_CONV_KERNEL_H] = IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_KERNEL_H)); - fractal_int_info_[ATTR_CONV_STRIDE_H] = IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_STRIDE_H)); - fractal_int_info_[ATTR_CONV_KERNEL_W] = IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_KERNEL_W)); - fractal_int_info_[ATTR_CONV_STRIDE_W] = IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_STRIDE_W)); + fractal_int_info_[ATTR_CONV_KERNEL_H] = IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_KERNEL_H)); + fractal_int_info_[ATTR_CONV_STRIDE_H] = IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_STRIDE_H)); + fractal_int_info_[ATTR_CONV_KERNEL_W] = IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_KERNEL_W)); + fractal_int_info_[ATTR_CONV_STRIDE_W] = IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_STRIDE_W)); } } @@ -1597,16 +1607,17 @@ std::string TensorMarkTag(MemType mem_type, MemFlow mem_flow) { * Now REALIZE_C1/REALIZE_C0/REALIZE_BUF mark_tag is equal to its MemType. * For mem_type is DDR, mark_tag is empty string "". 
* */ + const int mem_flow_size = 3; switch (mem_type) { case MemType::C1_: - if (mem_flow.size() == 3 && mem_flow[0] == MemType::DDR && mem_flow[1] == MemType::C1_ && - mem_flow[2] == MemType::BUF_C1_) + if (mem_flow.size() == mem_flow_size && mem_flow[0] == MemType::DDR && mem_flow[1] == MemType::C1_ && + mem_flow[mem_flow_size - 1] == MemType::BUF_C1_) return REALIZE_C1BUFC1; return REALIZE_C1; case MemType::BUF_: // ordinary conv condition no fusion - if (mem_flow.size() == 3 && mem_flow[0] == MemType::DDR && mem_flow[1] == mem_type && - mem_flow[2] == MemType::C0C_) + if (mem_flow.size() == mem_flow_size && mem_flow[0] == MemType::DDR && mem_flow[1] == mem_type && + mem_flow[mem_flow_size - 1] == MemType::C0C_) return REALIZE_C0; return REALIZE_BUF; case MemType::C0A_: @@ -1618,7 +1629,8 @@ std::string TensorMarkTag(MemType mem_type, MemFlow mem_flow) { case MemType::BUF_C0_: return REALIZE_BUFC0; case MemType::BUF_C1_: - if (mem_flow.size() == 2 && mem_flow[0] == MemType::DDR && mem_flow[1] == MemType::BUF_C1_) return REALIZE_C1; + if (mem_flow.size() == (mem_flow_size - 1) && mem_flow[0] == MemType::DDR && mem_flow[1] == MemType::BUF_C1_) + return REALIZE_C1; return REALIZE_BUFC1; case MemType::DDR: return ""; diff --git a/src/poly/scop_info.h b/src/poly/scop_info.h index 0d9d9760bbb9dac062b7dddc594e3bac2ad2340e..41b9c4d3f584ca499ae4da58d95b36c78257ccb4 100644 --- a/src/poly/scop_info.h +++ b/src/poly/scop_info.h @@ -312,8 +312,8 @@ class UserConfig { ParseIntAttr(attrs, "csr_thread_num", &csr_thread_num_); ParseIntAttr(attrs, "csr_avg_row", &csr_avg_row_); ParseStringAttr(attrs, "shared_memory_tensors", &shared_tensors_); + ParseStringAttr(attrs, "register_memory_tensors", &register_tensors_); ParseStringAttr(attrs, "reduce_lib_type", &reduce_lib_type_); - ParseStringAttr(attrs, "local_memory_tensors", &local_tensors_); ParseVectorLengthAttr(attrs,
"vector_length", &vector_length_, false); @@ -542,10 +542,13 @@ class UserConfig { bool GetUseSharedMemory() const { return use_shared_memory_; } void SetGetUseSharedMemory(bool use_shared_memory) { use_shared_memory_ = use_shared_memory; } void SetGetUseRegisterMemory(bool use_register_memory) { use_register_memory_ = use_register_memory; } - void SetSharedTensors(std::string shared_tensors) { shared_tensors_ = shared_tensors; } - std::string GetSharedTensors() { return shared_tensors_; } + + std::unordered_set GetSplitTensors(const std::string &tensor_name); + void RecordSharedTensors(const std::string &tensor_name) { shared_tensors_ += (SPACE_PATTERN + tensor_name); } + std::unordered_set GetSharedTensors() { return GetSplitTensors(shared_tensors_); } + std::unordered_set GetRegisterTensors() { return GetSplitTensors(register_tensors_); } + std::string GetReduceLibType() { return reduce_lib_type_; } - std::string GetLocalTensors() { return local_tensors_; } void SetEnableBankConflict(bool enable_bank_conflict) { enable_bank_conflict_ = enable_bank_conflict; } bool GetEnableBankConflict() { return enable_bank_conflict_; } int GetVectorLength() { return vector_length_; } @@ -709,16 +712,16 @@ class UserConfig { bool use_shared_memory_{true}; // shared memory tensor list std::string shared_tensors_; + // local memory tensor list + std::string register_tensors_; // reduce lib type, for now, there are two selection // one is named "origin" // one is named "paris" std::string reduce_lib_type_{"origin"}; - // local memory tensor list - std::string local_tensors_; // vectorization int vector_length_{0}; bool enable_one_dim_thread_{false}; - bool enable_vectorization_{false}; + bool enable_vectorization_{true}; // tiling config std::string b_dim_; @@ -938,6 +941,7 @@ class AnalysisResult { bool enable_vectorization{false}; bool is_thread_tile{false}; bool is_block_tile{false}; + std::set coalesced_access_tensors; }; void RecordWrites(const isl::union_map &writes) { 
writes_ = writes; } diff --git a/src/poly/scop_make_schedule_tree.cc b/src/poly/scop_make_schedule_tree.cc index f53125cf2aef87162825c9e6ef819f64795ad2a0..40eef33202a3f246cfa5ced8ba312e673a5d707f 100644 --- a/src/poly/scop_make_schedule_tree.cc +++ b/src/poly/scop_make_schedule_tree.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021 Huawei Technologies Co., Ltd + * Copyright 2021-2022 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/poly/tiling/tiling_strategy_manager_gpu.cc b/src/poly/tiling/tiling_strategy_manager_gpu.cc index 74c0cd20854ee35a7eb151b18bb5a2c98e2f9884..b5679d39d382dbe5b953818c177d13b6c8bc8f80 100644 --- a/src/poly/tiling/tiling_strategy_manager_gpu.cc +++ b/src/poly/tiling/tiling_strategy_manager_gpu.cc @@ -924,10 +924,12 @@ void GpuStrategy::AddGpuConstraint() { } is_first = false; - } - if (!((template_ == Template::MATMUL || template_ == Template::CONV) && - analyzer_->scop_info_.user_config_.GetEnableTensorCore())) { + if ((template_ == Template::MATMUL || template_ == Template::CONV) && + analyzer_->scop_info_.user_config_.GetEnableTensorCore()) { + continue; + } + analyzer_->ForEachAxisTopDown([this](TileAxis *axis) { if (axis == analyzer_->RootAxis()) { return; @@ -980,6 +982,9 @@ void GpuStrategy::VectorizationSpeedup() { } bool GpuStrategy::IsVectorized() { + if (!analyzer_->scop_info_.user_config_.GetEnableVectorization()) { + return false; + } auto reads_access = analyzer_->scop_info_.analysis_result_.GetReads().domain_factor_domain(); auto write_access = analyzer_->scop_info_.analysis_result_.GetWrites().domain_factor_domain(); auto original_access = reads_access.unite(write_access); @@ -1058,7 +1063,6 @@ void GpuStrategy::CheckVectorizationForElemwiseOp() { analyzer_->scop_info_.user_config_.SetVectorLength(quadruple_warp_size_); } current_outer_bn_->enable_vectorization = true; - 
analyzer_->scop_info_.user_config_.SetEnableVectorization(true); } void GpuStrategy::ThreadConfiguration(ReduceDirection direct, bool use_lib) { diff --git a/tests/st/ops/gpu/test_mindtricks.py b/tests/st/ops/gpu/test_mindtricks.py index e0415684d737851d004d2b4d96dd0c312296d2b6..db9a94ac57691ebbaa3d53ab435967dbccd033ab 100644 --- a/tests/st/ops/gpu/test_mindtricks.py +++ b/tests/st/ops/gpu/test_mindtricks.py @@ -62,9 +62,10 @@ composite_targets = { "Fused_AddN_fusion_9584919353229493170", "Fused_Cast_BiasAdd_Gelu_fusion_7719078727474100806", "Fused_Cast_BiasAdd_GkDropout_tuple_getitem_TensorAdd_fusion_13282325956852925231", - "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1039082044534023692", - "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1545859458890067484", - "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1976850843332086880", + # Note: swizzle pass requires constant tensor must be promoted. + # "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1039082044534023692", + # "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1545859458890067484", + # "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1976850843332086880", "Fused_GkDropout_2353362030752466006", "Fused_Transpose_split_18185609042134105765", ],