diff --git a/python/akg/composite/split_stitch.py b/python/akg/composite/split_stitch.py
index 477b594274fa5757758f2b7f8df8a245ff7f1653..b69ffa1fe102471657535ed34a731c2907f002f7 100644
--- a/python/akg/composite/split_stitch.py
+++ b/python/akg/composite/split_stitch.py
@@ -104,7 +104,9 @@ def _shared_memory_optimization(desc_d, req_map, outputs):
             # rule1: one buffer start larger equal to the reused buffer end.
             if sort_req_liveness[sort_req_buf[i]].start >= sort_req_liveness[sort_req_buf[j]].end:
                 # rule2: sizes are compatible.
-                if req_map[sort_req_buf[i]][0] <= req_map[sort_req_buf[j]][0] and sort_req_buf[j] not in outputs:
+                if sort_req_buf[i] in outputs or sort_req_buf[j] in outputs:
+                    continue
+                if req_map[sort_req_buf[i]][0] <= req_map[sort_req_buf[j]][0]:
                     # rule3: make sure the candidate reused buffer is not using by other conflict variable.
                     for item in reverse_reuse_map.get(sort_req_buf[j], []):
                         if (sort_req_liveness[item].end >= sort_req_liveness[sort_req_buf[i]].end) \
diff --git a/src/poly/create_cluster.cc b/src/poly/create_cluster.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f4b173938de4641e0b31165ff477cc037d6257b
--- /dev/null
+++ b/src/poly/create_cluster.cc
@@ -0,0 +1,580 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "create_cluster.h"
+#include "poly/schedule_tree_util.h"
+#include "poly/scop.h"
+#include "poly/dma_inject.h"
+#include "poly/poly_util.h"
+#include <vector>
+
+namespace akg {
+namespace ir {
+namespace poly {
+std::set<std::string> CreateCluster::GetAllPromotedTensor() {
+  std::set<std::string> all_tensors;
+  auto RecordPromotedTensor = [&all_tensors](StmtIdHashMap tensor_map) -> void {
+    for (const auto &item : tensor_map) {
+      for (const auto &item_id : item.second) {
+        all_tensors.emplace(item_id.get_name());
+      }
+    }
+  };
+
+  auto read_map = scop_info_.StmtReadMap();
+  auto write_map = scop_info_.StmtWriteMap();
+  RecordPromotedTensor(read_map);
+  RecordPromotedTensor(write_map);
+  return all_tensors;
+}
+
+std::set<std::string> CreateCluster::GetTempPromotedTensor(std::set<std::string> all_tensors) {
+  auto origin_binds = scop_info_.user_config_.GetOriginBind();
+  std::set<std::string> orig_tensors;
+
+  for (const auto &item : origin_binds) {
+    if (!item.first.defined()) continue;
+    auto id = isl::id(scop_info_.ctx_, item.first->op->name);
+    orig_tensors.insert(id.get_name());
+  }
+  std::set<std::string> temp_tensors;
+  std::set_difference(all_tensors.begin(), all_tensors.end(), orig_tensors.begin(), orig_tensors.end(),
+                      std::inserter(temp_tensors, temp_tensors.begin()));
+  return temp_tensors;
+}
+
+void CreateCluster::RecordInitPromotedTensorType(const std::unordered_set<std::string> &configed_tensors) {
+  std::set<std::string> all_tensors = GetAllPromotedTensor();
+  std::set<std::string> temp_tensors = GetTempPromotedTensor(all_tensors);
+  std::unordered_set<std::string> not_promoted_tensors = scop_info_.analysis_result_.GetTensorsNotPromote();
+
+  // According to the current judgment, initialize the promoted type of all tensor.
+  for (auto tensor : all_tensors) {
+    auto id = isl::id(scop_info_.ctx_, tensor);
+    if (configed_tensors.find(tensor) != configed_tensors.end()) {
+      all_tensors_[id] = PromotedTensorType::CUSTOM;
+    } else if (not_promoted_tensors.find(tensor) != not_promoted_tensors.end()) {
+      all_tensors_[id] = PromotedTensorType::NONE;
+    } else if (temp_tensors.find(tensor) != temp_tensors.end()) {
+      all_tensors_[id] = PromotedTensorType::TEMP;
+    } else {
+      all_tensors_[id] = PromotedTensorType::OTHERS;
+    }
+  }
+}
+
+std::vector<std::pair<isl::id, PromotedTensorType>> CreateCluster::SortPromotedTensorInfo(
+  const PromotedTensor &all_tensors) {
+  // Sort the tensor according to the promoted type
+  auto Compute = [](std::pair<isl::id, PromotedTensorType> a, std::pair<isl::id, PromotedTensorType> b) -> bool {
+    if (a.second == b.second) {
+      return a.first.get_name() < b.first.get_name();
+    }
+    return a.second > b.second;
+  };
+
+  std::vector<std::pair<isl::id, PromotedTensorType>> tensor_list;
+  for (auto it = all_tensors.begin(); it != all_tensors.end(); it++) {
+    // If the current operator does not need to be promoted, it does not need to be sorted.
+    if (it->second == PromotedTensorType::NONE) {
+      continue;
+    }
+    tensor_list.push_back(std::pair<isl::id, PromotedTensorType>(it->first, it->second));
+  }
+
+  std::sort(tensor_list.begin(), tensor_list.end(), Compute);
+  return tensor_list;
+}
+
+// Record the final tensor that needs to be promoted.
+void CreateCluster::RecordPromotedTensorInfo(const isl::schedule_node &orig_node, const std::string &mark_name,
+                                             const PromotedTensor &all_tensors) {
+  auto all_tensors_list = SortPromotedTensorInfo(all_tensors);
+  if (all_tensors.size() <= 0) {
+    return;
+  }
+
+  isl::union_map reads = scop_info_.analysis_result_.GetReads();
+  isl::union_map writes = scop_info_.analysis_result_.GetWrites();
+  isl::union_map copyin = scop_info_.analysis_result_.GetCopyin();
+  isl::union_map fake_copyin = scop_info_.analysis_result_.GetFakeCopyin();
+
+  std::vector<isl::schedule_node> nodes = CollectMarkNode(orig_node, mark_name);
+
+  for (const auto &node : nodes) {
+    auto tree = node.parent();
+    auto partial_sched = GetPartialSchedule(tree);
+
+    for (const auto &tensor : all_tensors_list) {
+      auto promoted_id = tensor.first;
+      BufferDefInfo promoted_info = GetPromotedInfo(promoted_id, mark_name);
+
+      promoted_info.footprints_cluster = TensorFootprintCluster::HoistBufferFootprintCluster(
+        partial_sched, promoted_id, reads, copyin, writes, fake_copyin);
+      if (promoted_info.footprints_cluster == nullptr ||
+          !CheckPromotion(tree, orig_node, *promoted_info.footprints_cluster, tensor)) {
+        continue;
+      }
+
+      promoted_info.footprint_cluster_map.emplace_back(std::make_pair(tree, promoted_info.footprints_cluster));
+      scop_info_.analysis_result_.buffer_def_infos_.push_back(promoted_info);
+    }
+  }
+}
+
+void CreateCluster::RecordGemmTensors() {
+  auto tensors = GetMatmulTensorsName(scop_info_);
+  auto RecordPromotedTensor = [this, &tensors](const std::string &matrix_name) -> void {
+    if (tensors.count(matrix_name) == 0) {
+      return;
+    }
+    auto id = isl::id(scop_info_.ctx_, tensors[matrix_name]);
+    if (all_tensors_.count(id) == 0 || all_tensors_[id] < PromotedTensorType::SPECIAL) {
+      all_tensors_[id] = PromotedTensorType::SPECIAL;
+    }
+  };
+
+  RecordPromotedTensor(MATRIX_A);
+  RecordPromotedTensor(MATRIX_B);
+  RecordPromotedTensor(MATRIX_C);
+}
+
+PromotedTensor CreateCluster::GetCurrentMarkerTensors(const bool hoist_tensor_c) {
+  PromotedTensor current_tensors;
+  for (auto &tensor : all_tensors_) {
+    auto id_name = tensor.first.get_name();
+    auto tensor_mark = GetTensorMark(id_name, scop_info_);
+    // Only promote tensor A/B at the position marked A/B.
+    bool is_tensor_ab = !hoist_tensor_c && (tensor_mark == TENSOR_A || tensor_mark == TENSOR_B);
+    // Only promote tensor C at the position marked C.
+    bool is_tensor_c = hoist_tensor_c && tensor_mark == TENSOR_C;
+    if (is_tensor_c || is_tensor_ab) {
+      current_tensors.insert(tensor);
+    }
+  }
+
+  return current_tensors;
+}
+
+/*********************************************
+ * Shared Create Cluster
+ *********************************************/
+bool SharedCreateCluster::CoalescingAccessWay(const isl::schedule_node &node, const isl::schedule_node &root,
+                                              const TensorFootprintCluster &cluster) {
+  isl::union_map original = cluster.OrigianlAccessRelations();
+  size_t tensor_dim = cluster.foot_print_.GetBoxDim();
+  std::vector<isl::schedule_node> thread_marker = CollectFnNode(IsThreadMappedMark, root);
+  for (auto item : thread_marker) {
+    if (!(item.isa<isl::schedule_node_mark>()) && !(item.has_children()) &&
+        !(item.child(0).isa<isl::schedule_node_filter>())) {
+      continue;
+    }
+    isl::schedule_node thread_filter = item.child(0);
+    if (!thread_filter.has_children()) {
+      continue;
+    }
+    isl::schedule_node thread_band = thread_filter.child(0);
+    if (!thread_band.has_children()) {
+      continue;
+    }
+    isl::schedule_node inner_band = thread_band.child(0);
+    size_t num_mapped_thread = inner_band.schedule_depth() - thread_band.schedule_depth();
+    if (num_mapped_thread == 0) {
+      continue;
+    }
+    size_t inner_depth = inner_band.schedule_depth();
+    auto active_domains = CollectDomain(thread_band);
+    auto local_access = original.intersect_domain(active_domains);
+    auto schedule = ShortSchedule(inner_band);
+    auto schedule_access = local_access.apply_domain(schedule);
+    for (auto access : schedule_access.get_map_list()) {
+      if (!IsSubsetForIncreaseDim(access, tensor_dim - 1, inner_depth - 1)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Determine whether the current tensor needs to be promoted.
+bool SharedCreateCluster::CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                                         const TensorFootprintCluster &cluster,
+                                         const std::pair<isl::id, PromotedTensorType> &tensor_info) {
+  if (tensor_info.second > PromotedTensorType::TEMP) {
+    return true;
+  }
+  auto partial_sched_mupa = ShortScheduleMupa(current_node.root(), current_node);
+  auto coalesced_access = scop_info_.analysis_result_.GetOuterBandNode(band_index_)->coalesced_access_tensors;
+  auto tensor_name = tensor_info.first.get_name();
+  if (CoalescingAccessWay(current_node, node, cluster) ||
+      coalesced_access.find(tensor_name) != coalesced_access.end()) {
+    return true;
+  }
+  return false;
+}
+
+isl::union_map SharedCreateCluster::GetPartialSchedule(const isl::schedule_node &node) {
+  auto root_node = node.root();
+  CHECK(!IsAncestorMapToThread(node)) << "shared memory promotion cannot below thread_marker.";
+  auto block_cfg = scop_info_.user_config_.GetBlockConfig();
+  CHECK(block_cfg != nullptr) << "block config is null";
+  auto replace_cfg = scop_info_.user_config_.GetReplaceConfig();
+  MappingStrategyAxisMap mapping_strategy = scop_info_.user_config_.GetOuterMappingStrategy(band_index_);
+  std::unordered_set<std::string> non_repeated_idx = GetNonRepeatedIdx(mapping_strategy);
+  auto mapping_filter_info = GetMappingFilterInfo(root_node, block_cfg, replace_cfg, non_repeated_idx);
+
+  auto partial_sched = LocalSchedule(node);
+  if (!mapping_filter_info.is_empty()) {
+    partial_sched = partial_sched.intersect_domain(mapping_filter_info);
+  }
+  return partial_sched;
+}
+
+BufferDefInfo SharedCreateCluster::GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) {
+  GpuMemType gpu_mem_type = GpuMemType::SHARED;
+  MemType mem_type = MemType::SHARED_;
+
+  isl::id dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, promoted_id);
+  if (scop_info_.IsCopyinTensor(promoted_id.get_name()) && band_index_ != 0) {
+    dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, promoted_id, band_index_);
+  }
+  std::vector<size_t> buffer_sizes;
+  std::vector<std::pair<isl::id, MemType>> data_stream;
+  data_stream.push_back(std::make_pair(promoted_id, MemType::DDR));
+  data_stream.push_back(std::make_pair(promoted_id, mem_type));
+  BufferDefInfo promoted_info = BufferDefInfo{promoted_id,
+                                              dst_tensor_id,
+                                              promoted_id,
+                                              MemType::DDR,
+                                              mark_name,
+                                              false,
+                                              false,
+                                              data_stream,
+                                              Tensor(),
+                                              Handle(),
+                                              buffer_sizes,
+                                              nullptr,
+                                              isl::union_map::empty(isl::space(scop_info_.ctx_, 0))};
+
+  return promoted_info;
+}
+
+void SharedCreateCluster::CreateClusterListForGemm(const isl::schedule_node &node,
+                                                   const std::unordered_set<std::string> &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetSharedTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  // Modify promoted type of tensor A/B/C for gemm operator.
+  RecordGemmTensors();
+
+  for (const auto &mark_name : mark_names) {
+    bool hoist_tensor_c = mark_name == PROMOTE_GLOBAL_TO_SHARED_C;
+    // Promote the specific tensor at the corresponding marker position.
+    PromotedTensor current_tensors = GetCurrentMarkerTensors(hoist_tensor_c);
+    RecordPromotedTensorInfo(node, mark_name, current_tensors);
+  }
+}
+
+void SharedCreateCluster::CreateClusterListForElementWise(const isl::schedule_node &node,
+                                                          const std::unordered_set<std::string> &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetSharedTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  for (const auto &mark_name : mark_names) {
+    RecordPromotedTensorInfo(node, mark_name, all_tensors_);
+  }
+}
+
+void SharedCreateCluster::CreateClusterListForReduce(const isl::schedule_node &node,
+                                                     const std::unordered_set<std::string> &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetSharedTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  // Modify promoted type of the returned tensor for reduce operator.
+  RecordReduceTensors();
+
+  for (const auto &mark_name : mark_names) {
+    RecordPromotedTensorInfo(node, mark_name, all_tensors_);
+  }
+}
+
+void SharedCreateCluster::RecordReduceTensors() {
+  // In order to enable cuda atomic operator, add these tensors for shared memory promotion list
+  auto atomic_tensors = scop_info_.analysis_result_.GetAtomicTensors();
+  if (!atomic_tensors.empty()) {
+    for (const auto &item : atomic_tensors) {
+      auto id = isl::id(scop_info_.ctx_, item.tensor_name);
+      if (all_tensors_.count(id) == 0 || all_tensors_[id] < PromotedTensorType::SPECIAL) {
+        all_tensors_[id] = PromotedTensorType::SPECIAL;
+      }
+    }
+  }
+
+  // For the condition that it is without cuda atomic usage, but with reduce operation.
+  // Also need to add these tensors for shared memory promotion list.
+  auto reduce_out_tensors = scop_info_.analysis_result_.GetReduceTensorInfoMap();
+  for (const auto &item : reduce_out_tensors) {
+    auto id = isl::id(scop_info_.ctx_, item.second.write_tensor_name);
+    if (all_tensors_.count(id) == 0 || all_tensors_[id] < PromotedTensorType::SPECIAL) {
+      all_tensors_[id] = PromotedTensorType::SPECIAL;
+    }
+  }
+
+  // For the reduce operator, only the return tensor and the temp tensor can be promoted. For ordinary tensor, it will
+  // cause an error in the reduce interface after promotion.
+  for (auto &tensor : all_tensors_) {
+    if (tensor.second == PromotedTensorType::OTHERS) {
+      all_tensors_[tensor.first] = PromotedTensorType::NONE;
+    }
+  }
+}
+
+/*********************************************
+ * Register Create Cluster
+ *********************************************/
+isl::union_map RegisterCreateCluster::GetPartialSchedule(const isl::schedule_node &node) {
+  auto root_node = node.root();
+  auto block_cfg = scop_info_.user_config_.GetBlockConfig();
+  CHECK(block_cfg != nullptr) << "block config is null";
+  auto replace_cfg = scop_info_.user_config_.GetReplaceConfig();
+  MappingStrategyAxisMap mapping_strategy = scop_info_.user_config_.GetOuterMappingStrategy(band_index_);
+  std::unordered_set<std::string> non_repeated_idx = GetNonRepeatedIdx(mapping_strategy);
+  auto block_mapping = GetMappingFilterInfo(root_node, block_cfg, replace_cfg, non_repeated_idx);
+
+  auto thread_cfg = scop_info_.user_config_.GetThreadConfig();
+  CHECK(thread_cfg != nullptr) << "thread config is null";
+  auto thread_mapping = isl::union_set::empty(block_mapping.ctx());
+  mapping_strategy = scop_info_.user_config_.GetInnerMappingStrategy(band_index_);
+  non_repeated_idx = GetNonRepeatedIdx(mapping_strategy);
+  thread_mapping = GetMappingFilterInfo(root_node, thread_cfg, replace_cfg, non_repeated_idx);
+
+  auto partial_sched = LocalSchedule(node);
+  if (!thread_mapping.is_empty() && !block_mapping.is_empty()) {
+    auto mapping = block_mapping.intersect(thread_mapping);
+    partial_sched = partial_sched.intersect_domain(mapping);
+  } else if (!thread_mapping.is_empty()) {
+    partial_sched = partial_sched.intersect_domain(thread_mapping);
+  } else if (!block_mapping.is_empty()) {
+    partial_sched = partial_sched.intersect_domain(block_mapping);
+  }
+  return partial_sched;
+}
+
+// Check if the given "group" can be promoted to registers for the given mapping to thread identifiers and within the
+// given outer schedule.
+bool RegisterCreateCluster::IsPromote(const TensorFootprintCluster &fp_cluster,
+                                      const isl::multi_union_pw_aff &partial_sched_mupa,
+                                      const isl::multi_union_pw_aff &thread_schedule) {
+  // compute the mapping relation between single thread and outer schedule space and tensor elements pair
+  isl::union_map state_schedule_mapping =
+    ScheduleTensorMapping(partial_sched_mupa, fp_cluster.OrigianlAccessRelations());
+  isl::union_map thread_schedule_mapping = state_schedule_mapping.apply_domain(isl::union_map::from(thread_schedule));
+  // check that whether the mapping relation between single thread and outer schedule points and group elements pair
+  // is injective.
+  return thread_schedule_mapping.is_injective();
+}
+
+// Determine whether the current tensor needs to be promoted.
+bool RegisterCreateCluster::CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                                           const TensorFootprintCluster &cluster,
+                                           const std::pair<isl::id, PromotedTensorType> &tensor_info) {
+  if (tensor_info.second > PromotedTensorType::OTHERS) {
+    return true;
+  }
+  isl::schedule_node root_node = current_node.get_schedule().get_root();
+  auto thread_cfg = scop_info_.user_config_.GetThreadConfig();
+  CHECK(thread_cfg != nullptr) << "thread config is null";
+  auto block_cfg = scop_info_.user_config_.GetBlockConfig();
+  CHECK(block_cfg != nullptr) << "block config is null";
+
+  auto thread_schedule = MapDomainAllWithType(root_node, thread_cfg, scop_info_.upa_node_mapping_, THREAD_MARKER);
+  auto block_schedule = MapDomainAllWithType(root_node, block_cfg, scop_info_.upa_node_mapping_, BLOCK_MARKER);
+  auto tmp_node = current_node;
+  if (current_node.isa<isl::schedule_node_band>()) {
+    tmp_node = current_node.child(0);
+  }
+
+  auto partial_sched_mupa = ShortScheduleMupa(root_node, tmp_node);
+  partial_sched_mupa = partial_sched_mupa.flat_range_product(block_schedule).flat_range_product(thread_schedule);
+  if (IsPromote(cluster, partial_sched_mupa, thread_schedule)) {
+    return true;
+  }
+
+  return false;
+}
+
+BufferDefInfo RegisterCreateCluster::GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) {
+  isl::id dst_tensor_id = GetGpuIndexDstId(GpuMemType::LOCAL, promoted_id);
+  if (scop_info_.IsCopyinTensor(promoted_id.get_name()) && band_index_ != 0) {
+    dst_tensor_id = GetGpuIndexDstId(GpuMemType::LOCAL, promoted_id, band_index_);
+  }
+
+  std::vector<size_t> buffer_sizes;
+  std::vector<std::pair<isl::id, MemType>> data_stream;
+  MemType memtype;
+  isl::id tmp_item;
+  if (!shared_tensor_.count(promoted_id.get_name() + SHARE_SUFFIX)) {
+    tmp_item = promoted_id;
+    data_stream.push_back(std::make_pair(promoted_id, MemType::DDR));
+    data_stream.push_back(std::make_pair(promoted_id, MemType::LOCAL_));
+    memtype = MemType::DDR;
+  } else {
+    tmp_item = isl::id(scop_info_.ctx_, promoted_id.get_name() + SHARE_SUFFIX);
+    data_stream.push_back(std::make_pair(promoted_id, MemType::SHARED_));
+    data_stream.push_back(std::make_pair(promoted_id, MemType::LOCAL_));
+    memtype = MemType::SHARED_;
+  }
+  BufferDefInfo promoted_info = BufferDefInfo{tmp_item,
+                                              dst_tensor_id,
+                                              tmp_item,
+                                              memtype,
+                                              mark_name,
+                                              false,
+                                              false,
+                                              data_stream,
+                                              Tensor(),
+                                              Handle(),
+                                              buffer_sizes,
+                                              nullptr,
+                                              isl::union_map::empty(isl::space(scop_info_.ctx_, 0))};
+
+  return promoted_info;
+}
+
+// Operators that have been promoted to the shared memory do not need to be promoted to the register memory in general.
+// Except for gemm operators.
+void RegisterCreateCluster::RecordSharedPromotedTensors(const bool is_gemm) {
+  for (auto buffer : scop_info_.analysis_result_.active_buffer_footprints_) {
+    shared_tensor_.insert(buffer.second.cluster_id.get_name());
+  }
+
+  if (is_gemm) {
+    return;
+  }
+
+  std::string shared_suffix = SHARE_SUFFIX;
+  for (const auto &item : shared_tensor_) {
+    auto id = isl::id(scop_info_.ctx_, item.substr(0, item.length() - shared_suffix.size()));
+    if (all_tensors_.count(id) == 0 || all_tensors_[id] < PromotedTensorType::NONE) {
+      all_tensors_[id] = PromotedTensorType::NONE;
+    }
+  }
+}
+
+void RegisterCreateCluster::CreateClusterListForGemm(const isl::schedule_node &node,
+                                                     const std::unordered_set<std::string> &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetRegisterTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  // Statistics shared_tensor_ information.
+  RecordSharedPromotedTensors(true);
+  // Modify promoted type of tensor A/B/C for gemm operator.
+  RecordGemmTensors();
+
+  for (const auto &mark_name : mark_names) {
+    bool hoist_tensor_c = ((mark_name == PROMOTE_GLOBAL_TO_REGISTER_C) || (mark_name == PROMOTE_SHARED_TO_REGISTER_C));
+    // Promote the specific tensor at the corresponding marker position.
+    PromotedTensor current_tensors = GetCurrentMarkerTensors(hoist_tensor_c);
+    RecordPromotedTensorInfo(node, mark_name, current_tensors);
+  }
+}
+
+void RegisterCreateCluster::CreateClusterListForElementWise(const isl::schedule_node &node,
+                                                            const std::unordered_set<std::string> &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetRegisterTensors();
+  bool is_enable_vectorization = scop_info_.analysis_result_.GetOuterBandNode(band_index_)->enable_vectorization;
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+  // Delete the tensor that has been promoted on shared memory.
+  RecordSharedPromotedTensors();
+  // Add the tensor that needs to be vectorized.
+  RecordVectorizedPromotedTensors(is_enable_vectorization);
+
+  for (const auto &mark_name : mark_names) {
+    RecordPromotedTensorInfo(node, mark_name, all_tensors_);
+  }
+}
+
+void RegisterCreateCluster::RecordVectorizedPromotedTensors(const bool is_enable_vectorization) {
+  for (auto tensor : all_tensors_) {
+    if (tensor.second > PromotedTensorType::OTHERS) {
+      continue;
+    }
+
+    if (is_enable_vectorization) {
+      all_tensors_[tensor.first] = PromotedTensorType::SPECIAL;
+    } else {
+      all_tensors_[tensor.first] = PromotedTensorType::NONE;
+    }
+  }
+}
+/*********************************************
+ * Cpu Create Cluster
+ *********************************************/
+BufferDefInfo CpuCreateCluster::GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) {
+  GpuMemType gpu_mem_type = GpuMemType::LOCAL;
+  MemType mem_type = MemType::LOCAL_;
+
+  isl::id dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, promoted_id);
+  if (scop_info_.IsCopyinTensor(promoted_id.get_name()) && band_index_ != 0) {
+    dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, promoted_id, band_index_);
+  }
+  std::vector<size_t> buffer_sizes;
+  std::vector<std::pair<isl::id, MemType>> data_stream;
+  data_stream.push_back(std::make_pair(promoted_id, MemType::DDR));
+  data_stream.push_back(std::make_pair(promoted_id, mem_type));
+  BufferDefInfo promoted_info = BufferDefInfo{promoted_id,
+                                              dst_tensor_id,
+                                              promoted_id,
+                                              MemType::DDR,
+                                              mark_name,
+                                              false,
+                                              false,
+                                              data_stream,
+                                              Tensor(),
+                                              Handle(),
+                                              buffer_sizes,
+                                              nullptr,
+                                              isl::union_map::empty(isl::space(scop_info_.ctx_, 0))};
+
+  return promoted_info;
+}
+
+isl::union_map CpuCreateCluster::GetPartialSchedule(const isl::schedule_node &node) { return LocalSchedule(node); }
+
+// Determine whether the current tensor needs to be promoted.
+bool CpuCreateCluster::CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                                      const TensorFootprintCluster &cluster,
+                                      const std::pair<isl::id, PromotedTensorType> &tensor_info) {
+  auto template_type = scop_info_.analysis_result_.GetOuterBandNode(band_index_)->template_type;
+  return template_type == Template::MATMUL && scop_info_.user_config_.GetEnableMatmul();
+}
+
+void CpuCreateCluster::CreateClusterListForGemm(const isl::schedule_node &node,
+                                                const std::unordered_set<std::string> &mark_names) {
+  auto configed_tensors = scop_info_.user_config_.GetRegisterTensors();
+  // Initialize the promoted types of all tensors.
+  RecordInitPromotedTensorType(configed_tensors);
+
+  for (auto mark_name : mark_names) {
+    // Promote the specific tensor at the corresponding marker position.
+    PromotedTensor current_tensors = GetCurrentMarkerTensors(false);
+    RecordPromotedTensorInfo(node, mark_name, current_tensors);
+  }
+}
+}  // namespace poly
+}  // namespace ir
+}  // namespace akg
diff --git a/src/poly/create_cluster.h b/src/poly/create_cluster.h
new file mode 100644
index 0000000000000000000000000000000000000000..607540a747b3670b1b110d37f7965badbe66feab
--- /dev/null
+++ b/src/poly/create_cluster.h
@@ -0,0 +1,141 @@
+/**
+ * Copyright 2022 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef POLY_CREATE_CLUSTER_H_
+#define POLY_CREATE_CLUSTER_H_
+
+#include "poly/schedule_pass.h"
+
+namespace akg {
+namespace ir {
+namespace poly {
+// tensor priority: custom > none > special > temp > others
+// OTHERS:Whether the tensor is promoted or not has no effect on functionality, only on performance
+// TEMP: the temp tensor must be promoted
+// SPECIAL: reduce, gemm, conv, etc. operators must be promoted
+// NONE: the tensor does not need to be promoted
+// CUSTOM: the custom tensor must be promoted
+enum class PromotedTensorType { OTHERS = 0, TEMP, SPECIAL, NONE, CUSTOM };
+using PromotedTensor = std::unordered_map<isl::id, PromotedTensorType, isl::IslIdIslHash>;
+
+class CreateCluster {
+ public:
+  explicit CreateCluster(ScopInfo &scop_info, int band_index) : scop_info_(scop_info), band_index_(band_index) {}
+  ~CreateCluster() {}
+
+ protected:
+  // Record the tensor that needs to be promoted.
+  std::set<std::string> GetAllPromotedTensor();
+  std::set<std::string> GetTempPromotedTensor(std::set<std::string> all_tensors);
+  void RecordInitPromotedTensorType(const std::unordered_set<std::string> &configed_tensors);
+
+  // Sort all tensors by their priority
+  std::vector<std::pair<isl::id, PromotedTensorType>> SortPromotedTensorInfo(const PromotedTensor &all_tensors);
+
+  // Record the final tensor that needs to be promoted.
+  void RecordPromotedTensorInfo(const isl::schedule_node &orig_node, const std::string &mark_name,
+                                const PromotedTensor &all_tensors);
+
+  // Common functions required by shared, register in gpu and cpu.
+  virtual isl::union_map GetPartialSchedule(const isl::schedule_node &node) = 0;
+  virtual BufferDefInfo GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) = 0;
+  virtual bool CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                              const TensorFootprintCluster &cluster,
+                              const std::pair<isl::id, PromotedTensorType> &tensor_info) = 0;
+
+  // gemm operator
+  void RecordGemmTensors();
+  PromotedTensor GetCurrentMarkerTensors(const bool hoist_tensor_c);
+
+  ScopInfo &scop_info_;
+  PromotedTensor all_tensors_;
+  int band_index_;
+};
+
+class SharedCreateCluster : public CreateCluster {
+ public:
+  explicit SharedCreateCluster(ScopInfo &scop_info, int band_index) : CreateCluster(scop_info, band_index) {}
+  ~SharedCreateCluster() {}
+
+  // Promoted tensors needed to create different types of operators.
+  void CreateClusterListForGemm(const isl::schedule_node &orig_node, const std::unordered_set<std::string> &mark_names);
+  void CreateClusterListForReduce(const isl::schedule_node &orig_node,
+                                  const std::unordered_set<std::string> &mark_names);
+  void CreateClusterListForElementWise(const isl::schedule_node &orig_node,
+                                       const std::unordered_set<std::string> &mark_names);
+
+ private:
+  bool CoalescingAccessWay(const isl::schedule_node &node, const isl::schedule_node &root,
+                           const TensorFootprintCluster &cluster);
+
+  // Common functions required by shared, register in gpu and cpu.
+  bool CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                      const TensorFootprintCluster &cluster,
+                      const std::pair<isl::id, PromotedTensorType> &tensor_info) override;
+  isl::union_map GetPartialSchedule(const isl::schedule_node &node) override;
+  BufferDefInfo GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) override;
+
+  void RecordReduceTensors();
+};
+
+class RegisterCreateCluster : public CreateCluster {
+ public:
+  explicit RegisterCreateCluster(ScopInfo &scop_info, int band_index) : CreateCluster(scop_info, band_index) {}
+  ~RegisterCreateCluster() {}
+
+  // Promoted tensors needed to create different types of operators.
+  void CreateClusterListForGemm(const isl::schedule_node &orig_node, const std::unordered_set<std::string> &mark_names);
+  void CreateClusterListForElementWise(const isl::schedule_node &orig_node,
+                                       const std::unordered_set<std::string> &mark_names);
+
+  isl::union_map GetPartialSchedule(const isl::schedule_node &node) override;
+
+ private:
+  void RecordSharedPromotedTensors(const bool is_gemm = false);
+  bool IsPromote(const TensorFootprintCluster &fp_cluster, const isl::multi_union_pw_aff &partial_sched_mupa,
+                 const isl::multi_union_pw_aff &thread_schedule);
+
+  // Common functions required by shared, register in gpu and cpu.
+  bool CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                      const TensorFootprintCluster &cluster,
+                      const std::pair<isl::id, PromotedTensorType> &tensor_info) override;
+  BufferDefInfo GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) override;
+
+  void RecordVectorizedPromotedTensors(const bool is_enable_vectorization);
+
+  std::set<std::string> shared_tensor_;
+};
+
+class CpuCreateCluster : public CreateCluster {
+ public:
+  explicit CpuCreateCluster(ScopInfo &scop_info, int band_index) : CreateCluster(scop_info, band_index) {}
+  ~CpuCreateCluster() {}
+  // Promoted tensors needed to create different types of operators.
+  void CreateClusterListForGemm(const isl::schedule_node &orig_node, const std::unordered_set<std::string> &mark_names);
+
+ private:
+  // Common functions required by shared, register in gpu and cpu.
+  isl::union_map GetPartialSchedule(const isl::schedule_node &node) override;
+  BufferDefInfo GetPromotedInfo(const isl::id &promoted_id, const std::string &mark_name) override;
+  bool CheckPromotion(const isl::schedule_node &current_node, const isl::schedule_node &node,
+                      const TensorFootprintCluster &cluster,
+                      const std::pair<isl::id, PromotedTensorType> &tensor_info) override;
+};
+}  // namespace poly
+}  // namespace ir
+}  // namespace akg
+
+#endif  // POLY_CREATE_CLUSTER_H_
\ No newline at end of file
diff --git a/src/poly/gpu_emit/gpu_isl_emitter.cc b/src/poly/gpu_emit/gpu_isl_emitter.cc
index 93d2f718a611a52af198744024056d80492fc2ef..d637168b2ede9e9553b06a5d62d5b4702e212b8e 100644
--- a/src/poly/gpu_emit/gpu_isl_emitter.cc
+++ b/src/poly/gpu_emit/gpu_isl_emitter.cc
@@ -256,7 +256,6 @@ Stmt GpuIslEmitter::EmitTensorOfTensorStmt(const Stmt &s) {
 }
 
 void GpuIslEmitter::UpdateGpuIndexDtype() {
-
   auto read_map = info_.StmtReadMap();
   auto write_map = info_.StmtWriteMap();
   std::set<std::string> id_sets;
@@ -289,12 +288,9 @@ void GpuIslEmitter::UpdateGpuIndexDtype() {
   }
 
   if (use_int64_idx_gpu) {
-    iter_name_map_ = {{B0, VarExpr(BLOCK_IDX_X, Int(64))},  
-                      {B1, VarExpr(BLOCK_IDX_Y, Int(64))},
-                      {B2, VarExpr(BLOCK_IDX_Z, Int(64))},  
-                      {T0, VarExpr(THREAD_IDX_X, Int(64))},
-                      {T1, VarExpr(THREAD_IDX_Y, Int(64))}, 
-                      {T2, VarExpr(THREAD_IDX_Z, Int(64))}};
+    iter_name_map_ = {{B0, VarExpr(BLOCK_IDX_X, Int(64))},  {B1, VarExpr(BLOCK_IDX_Y, Int(64))},
+                      {B2, VarExpr(BLOCK_IDX_Z, Int(64))},  {T0, VarExpr(THREAD_IDX_X, Int(64))},
+                      {T1, VarExpr(THREAD_IDX_Y, Int(64))}, {T2, VarExpr(THREAD_IDX_Z, Int(64))}};
   }
 }
 
@@ -321,7 +317,6 @@ class InitStmtInsertSync : public IRMutator {
 };
 
 Stmt GpuIslEmitter::Emit(const isl::ast_node &node) {
-
   UpdateGpuIndexDtype();
 
   Stmt stmt = EmitAst(node);
@@ -404,20 +399,11 @@ Stmt GpuIslEmitter::EmitRealizeForGlobalTensor(Stmt stmt) {
 Stmt GpuIslEmitter::EmitMark(const isl::ast_node_mark &node) {
   std::string mark = node.get_id().get_name();
 
-  // add for prefetch pass
-  if (mark == PROMOTE_GLOBAL_TO_SHARED_AB) {
-    Stmt stmt = EmitAst(node.get_node());
-    if (!stmt.defined()) {
-      return Stmt();
-    }
-    return AttrStmt::make(Expr("INFO"), SHARED_MEM_PROMOTED_COMPLETE, StringImm::make(SHARED_MEM_PROMOTED_COMPLETE),
-                          stmt);
-  }
-
   Stmt stmt;
 
   if ((mark == PROMOTE_VECTORIZATION) || (mark == PROMOTE_REGISTER_TO_GLOBAL) || (mark == PROMOTE_REGISTER_TO_SHARED) ||
-      (mark == PROMOTE_SHARED_TO_GLOBAL) || IsStartsWith(mark, REDUCE_ATOMIC_FLAG)) {
+      (mark == PROMOTE_SHARED_TO_GLOBAL) || (mark == SHARED_MEM_PROMOTED_COMPLETE) ||
+      IsStartsWith(mark, REDUCE_ATOMIC_FLAG)) {
     stmt = EmitAst(node.get_node());
     if (!stmt.defined()) {
       return Stmt();
diff --git a/src/poly/gpu_emit/gpu_isl_emitter.h b/src/poly/gpu_emit/gpu_isl_emitter.h
index b621637f6b6efe355f2e804b88e421c9568435d5..ede956f152e6e7ce17a22ec0d23debb87d95945b 100644
--- a/src/poly/gpu_emit/gpu_isl_emitter.h
+++ b/src/poly/gpu_emit/gpu_isl_emitter.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,7 +29,6 @@ constexpr auto MIND_TRICKS_SWIZZLE_PRAGMA = "pragma_swizzle_kernel";
 
 // add for one dimension mapping
 constexpr auto ORIGIN_THREAD_DIM_X = "bind_thread_x";
-constexpr auto SHARED_MEM_PROMOTED_COMPLETE = "shared_mem_promoted_complete";
 
 // example:
 // atomic_SumOp
@@ -62,12 +61,9 @@ class GpuIslEmitter : public IslEmitter {
   virtual Stmt SubstituteTensorStmt(const Stmt &s, Tensor origin, Tensor replaced);
   virtual Stmt EmitTensorOfTensorStmt(const Stmt &s);
   void UpdateGpuIndexDtype();
-  std::map<std::string, VarExpr> iter_name_map_{{B0, VarExpr(BLOCK_IDX_X, Int(32))},  
-                                                {B1, VarExpr(BLOCK_IDX_Y, Int(32))},
-                                                {B2, VarExpr(BLOCK_IDX_Z, Int(32))},  
-                                                {T0, VarExpr(THREAD_IDX_X, Int(32))},
-                                                {T1, VarExpr(THREAD_IDX_Y, Int(32))}, 
-                                                {T2, VarExpr(THREAD_IDX_Z, Int(32))}};
+  std::map<std::string, VarExpr> iter_name_map_{
+    {B0, VarExpr(BLOCK_IDX_X, Int(32))},  {B1, VarExpr(BLOCK_IDX_Y, Int(32))},  {B2, VarExpr(BLOCK_IDX_Z, Int(32))},
+    {T0, VarExpr(THREAD_IDX_X, Int(32))}, {T1, VarExpr(THREAD_IDX_Y, Int(32))}, {T2, VarExpr(THREAD_IDX_Z, Int(32))}};
 
  private:
   // override emitters for GPU
diff --git a/src/poly/poly_util.h b/src/poly/poly_util.h
index 0734391e1d3022dcfb91917cdd910bc0df69085a..13be1ebba52902dbf2ab242e77f5e7baeeb18882 100644
--- a/src/poly/poly_util.h
+++ b/src/poly/poly_util.h
@@ -468,6 +468,7 @@ constexpr auto FOR_VECTORIZED = "for_vectorized";
 constexpr auto FOR_UNROLLED = "for_unrolled";
 constexpr auto FOR_SWIZZLED = "for_swizzled";
 
+constexpr auto SHARED_MEM_PROMOTED_COMPLETE = "shared_mem_promoted_complete";
 constexpr auto PROMOTE_VECTORIZATION = "promote_vectorization";
 constexpr auto PROMOTE_VECTORIZATION_BIT = 128;
 constexpr auto THREAD_MARKER = "thread_marker";
diff --git a/src/poly/schedule_analysis/band_node_analysis.cc b/src/poly/schedule_analysis/band_node_analysis.cc
index a7a9b568637da29e07798dab692efeca56c3f3f1..e774e24b5185e3321f1bfd8e4e28ddc3685a9468 100644
--- a/src/poly/schedule_analysis/band_node_analysis.cc
+++ b/src/poly/schedule_analysis/band_node_analysis.cc
@@ -533,7 +533,7 @@ class OperatorInfoCollector {
 
     if (tensor_c_type == Float(16) && enable_tensor_core) {
       std::string shared_tensors = tensor_a_name + " " + tensor_b_name + " " + tensor_c_name;
-      scop_info_.user_config_.SetSharedTensors(shared_tensors);
+      scop_info_.user_config_.RecordSharedTensors(shared_tensors);
     }
 
     return true;
@@ -610,7 +610,7 @@ void AnalyzeBandNode::AnalyzeAxisPosition() {
     if (target_ == TARGET_CPU) {
       last_axis = GetVectorizationAxisForCpu(bn);
     } else {
-      last_axis = GetCoalescedAccessAxisForCuda(bn->node);
+      last_axis = GetCoalescedAccessAxisForCuda(bn);
     }
     bn->last_axis = last_axis;
   }
@@ -638,7 +638,7 @@ int AnalyzeBandNode::GetVectorizationAxisForCpu(std::unique_ptr<OuterBandNode> &
 }
 
 // For the tensor of tensor operator, confirm whether coalesced access is required in the calculation phase.
-int AnalyzeBandNode::GetCoalescedAccessAxisForCuda(const isl::schedule_node &orig_node) {
+int AnalyzeBandNode::GetCoalescedAccessAxisForCuda(std::unique_ptr<OuterBandNode> &bn) {
   int coalesced_access_axis = -1;
   if (scop_info_.user_config_.GetEnableMatmul()) {
     return coalesced_access_axis;
@@ -647,7 +647,8 @@ int AnalyzeBandNode::GetCoalescedAccessAxisForCuda(const isl::schedule_node &ori
   for (auto inner_tensor : scop_info_.analysis_result_.GetInnerTensor()) {
     skip_tensors.emplace(inner_tensor);
   }
-  coalesced_access_axis = GetLastAxisPos(orig_node, skip_tensors);
+  coalesced_access_axis = GetLastAxisPos(bn->node, skip_tensors);
+  RecordAllCoalescedAccessTensors(bn, skip_tensors);
   return coalesced_access_axis;
 }
 
@@ -676,6 +677,26 @@ int AnalyzeBandNode::GetLastAxisPos(const isl::schedule_node &orig_node, std::un
   return -1;
 }
 
+void AnalyzeBandNode::RecordAllCoalescedAccessTensors(std::unique_ptr<OuterBandNode> &bn,
+                                                      std::unordered_set<std::string> skip_tensors) {
+  if (!bn->node.isa<isl::schedule_node_band>()) {
+    return;
+  }
+
+  // Get read and write tensor information.
+  auto reads_access = scop_info_.analysis_result_.GetReads().domain_factor_domain();
+  reads_access.foreach_map([this, &bn, skip_tensors](const isl::map &map) -> void {
+    auto node = bn->node;
+    int band_number = static_cast<int>(node.as<isl::schedule_node_band>().n_member());
+    std::string id_name = map.range().get_tuple_name();
+    auto access = isl::union_map(map);
+    int last_axis = GetLastAxis(node, access, skip_tensors);
+    if (last_axis != -1 && last_axis < band_number - 1) {
+      bn->coalesced_access_tensors.emplace(id_name);
+    }
+  });
+}
+
 void AnalyzeBandNode::CollectStmtInfo() {
   auto prov_entry = scop_info_.analysis_result_.GetProvideAnalysis();
   auto provides = scop_info_.analysis_result_.GetStatementMap();
diff --git a/src/poly/schedule_analysis/band_node_analysis.h b/src/poly/schedule_analysis/band_node_analysis.h
index 92e28327a8c9bfedd9708cadb09372085013b899..629d7b4f0666bdce4db25f005a029b9b0bbe277a 100644
--- a/src/poly/schedule_analysis/band_node_analysis.h
+++ b/src/poly/schedule_analysis/band_node_analysis.h
@@ -42,8 +42,10 @@ class AnalyzeBandNode {
   void DetermineTemplateOfBand(std::unique_ptr<OuterBandNode> &bn);
   bool IsGemmTempleteInBand(std::unique_ptr<OuterBandNode> &bn);
   int GetVectorizationAxisForCpu(std::unique_ptr<OuterBandNode> &bn);
-  int GetCoalescedAccessAxisForCuda(const isl::schedule_node &orig_node);
+  int GetCoalescedAccessAxisForCuda(std::unique_ptr<OuterBandNode> &bn);
   int GetLastAxisPos(const isl::schedule_node &orig_node, std::unordered_set<std::string> skip_tensors = {});
+  void RecordAllCoalescedAccessTensors(std::unique_ptr<OuterBandNode> &bn,
+                                       std::unordered_set<std::string> skip_tensors = {});
 
   std::string target_;
   const isl::schedule &sch_;
diff --git a/src/poly/schedule_analysis/gpu_dma_analysis.h b/src/poly/schedule_analysis/gpu_dma_analysis.h
index 66ca46053548ed357cb0d6dc22b36c49d1e14b39..000a8aabdbb0a6088034bcfb03e1895589983e70 100644
--- a/src/poly/schedule_analysis/gpu_dma_analysis.h
+++ b/src/poly/schedule_analysis/gpu_dma_analysis.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,9 +27,7 @@ using TensorSets = std::unordered_set<isl::id, isl::IslIdIslHash>;
 class GpuDmaAnalysis {
  public:
   explicit GpuDmaAnalysis(const isl::schedule &sch, ScopInfo &scop_info) : sch_(sch), scop_info_(scop_info) {
-    if (!scop_info.user_config_.GetSharedTensors().empty()) {
-      configed_share_tensors_ = Split(scop_info.user_config_.GetSharedTensors(), " ");
-    }
+    configed_share_tensors_ = scop_info.user_config_.GetSharedTensors();
   };
 
   ~GpuDmaAnalysis() {}
@@ -48,7 +46,7 @@ class GpuDmaAnalysis {
  private:
   const isl::schedule &sch_;
   ScopInfo &scop_info_;
-  std::vector<std::string> configed_share_tensors_;
+  std::unordered_set<std::string> configed_share_tensors_;
   const int MAX_STRIDE = 65535;
 };
 
diff --git a/src/poly/schedule_pass/tile_outer_band.cc b/src/poly/schedule_pass/tile_outer_band.cc
index e25016d72c41f65ea6fbc511efd25aef30a17812..062c0a94d423b85d053bb90e9e1d26e6941e6bd1 100644
--- a/src/poly/schedule_pass/tile_outer_band.cc
+++ b/src/poly/schedule_pass/tile_outer_band.cc
@@ -1092,6 +1092,7 @@ isl::schedule_node TileOuterBand::MarkOuterPermutableCuda(isl::schedule_node nod
   // vectorize for elementwise operator
   if (scop_info_.analysis_result_.GetOuterBandNode(cur_band_index_)->enable_vectorization) {
     node = SetTileSizeAndTile(node, TILE_WITH_C0);
+    node = node.child(0).insert_mark(PROMOTE_GLOBAL_TO_REGISTER);
   }
   node = node.ancestor(node.get_tree_depth() - start_depth);
 
@@ -1261,7 +1262,6 @@ isl::schedule_node TileOuterBand::InsertPromoteMarker(const isl::schedule_node n
   // Add different promotion marks in different positions.
   if (is_matrixc_promote_shared) {
     tile_node = tile_node.insert_mark(isl::id(tile_node.ctx(), PROMOTE_GLOBAL_TO_SHARED_C)).child(0);
-    tile_node = tile_node.insert_mark(isl::id(tile_node.ctx(), PROMOTE_SHARED_TO_REGISTER_C)).child(0);
   } else {
     tile_node = tile_node.insert_mark(isl::id(tile_node.ctx(), PROMOTE_GLOBAL_TO_REGISTER_C)).child(0);
   }
@@ -1271,21 +1271,16 @@ isl::schedule_node TileOuterBand::InsertPromoteMarker(const isl::schedule_node n
 }
 
 bool TileOuterBand::IsMatrixCPromoteToShared() {
-  std::string shared_tensors = scop_info_.user_config_.GetSharedTensors();
+  std::unordered_set<std::string> shared_tensors = scop_info_.user_config_.GetSharedTensors();
   if (shared_tensors.empty()) {
     return false;
   }
 
-  shared_tensors += " ";
-  auto pos = shared_tensors.find(" ");
-  while (pos != std::string::npos) {
-    std::string tensor = shared_tensors.substr(0, pos);
+  for (const auto &tensor : shared_tensors) {
     auto matmul_map = scop_info_.analysis_result_.GetMatrixMatmulMap();
     if (matmul_map.count(tensor) && (matmul_map[tensor] == MATRIX_C || matmul_map[tensor] == MATRIX_ELSE)) {
       return true;
     }
-    shared_tensors = shared_tensors.substr(pos + 1, shared_tensors.size());
-    pos = shared_tensors.find(" ");
   }
   return false;
 }
diff --git a/src/poly/schedule_pass_cpu/cpu_memory_manager.cc b/src/poly/schedule_pass_cpu/cpu_memory_manager.cc
index a1359872afe3db0b4a565a79dc704395bfe32d9d..d9c210d000b6c4f0eed31aaaf8e20425257fe848 100644
--- a/src/poly/schedule_pass_cpu/cpu_memory_manager.cc
+++ b/src/poly/schedule_pass_cpu/cpu_memory_manager.cc
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "poly/schedule_pass_gpu/operator_shared_strategy.h"
+#include "poly/create_cluster.h"
 #include "cpu_memory_manager.h"
 #include "poly/schedule_tree_util.h"
 #include "poly/scop.h"
@@ -59,8 +59,8 @@ isl::schedule CpuMemoryManager::HoistCpuMemory() {
     }
 
     mark_names_ = {PROMOTE_GLOBAL_TO_REGISTER};
-    CpuMemoryStrategy other_op(scop_info_, mark_names_, band_index_);
-    other_op.CreateClusterList(orig_node);
+    CpuCreateCluster create_cluster(scop_info_, band_index_);
+    create_cluster.CreateClusterListForGemm(orig_node, mark_names_);
     auto node = orig_node;
     for (auto name : mark_names_) {
       mark_name = name;
@@ -188,42 +188,6 @@ isl::schedule_node CpuMemoryManager::HoistMemory(isl::schedule_node &tree, GpuMe
   return res_node;
 }
 
-bool CpuMemoryManager::CoalescingAccessWay(const isl::schedule_node &node, const TensorFootprintCluster &cluster) {
-  isl::union_map original = cluster.OrigianlAccessRelations();
-  size_t tensor_dim = cluster.foot_print_.GetBoxDim();
-  std::vector<isl::schedule_node> thread_marker = CollectFnNode(IsThreadMappedMark, schedule_.root());
-  for (auto item : thread_marker) {
-    if (!(item.isa<isl::schedule_node_mark>()) && !(item.has_children()) &&
-        !(item.child(0).isa<isl::schedule_node_filter>())) {
-      continue;
-    }
-    isl::schedule_node thread_filter = item.child(0);
-    if (!thread_filter.has_children()) {
-      continue;
-    }
-    isl::schedule_node thread_band = thread_filter.child(0);
-    if (!thread_band.has_children()) {
-      continue;
-    }
-    isl::schedule_node inner_band = thread_band.child(0);
-    size_t num_mapped_thread = inner_band.schedule_depth() - thread_band.schedule_depth();
-    if (num_mapped_thread == 0) {
-      continue;
-    }
-    size_t inner_depth = inner_band.schedule_depth();
-    auto active_domains = CollectDomain(thread_band);
-    auto local_access = original.intersect_domain(active_domains);
-    auto schedule = ShortSchedule(inner_band);
-    auto schedule_access = local_access.apply_domain(schedule);
-    for (auto access : schedule_access.get_map_list()) {
-      if (!IsSubsetForIncreaseDim(access, tensor_dim - 1, inner_depth - 1)) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
 isl::schedule CpuMemoryManager::InsertVectorizedMarker(const isl::schedule &sch) {
   auto GetPromotedWriteFilter = [this](isl::schedule_node node) -> isl::schedule_node {
     if (!node.isa<isl::schedule_node_band>() || !node.has_parent() || !node.parent().isa<isl::schedule_node_filter>()) {
diff --git a/src/poly/schedule_pass_cpu/cpu_memory_manager.h b/src/poly/schedule_pass_cpu/cpu_memory_manager.h
index bfd1528dfb0cdeb0fe4a923923bfad3eb1ec8a47..0a90a21f0ade438f9b1770934551bd60d4e94a4a 100644
--- a/src/poly/schedule_pass_cpu/cpu_memory_manager.h
+++ b/src/poly/schedule_pass_cpu/cpu_memory_manager.h
@@ -43,8 +43,6 @@ class CpuMemoryManager : public SchedulePass {
                                  const isl::id &dst_tensor_id, TensorFootprintCluster &cluster,
                                  bool force_last_extension_odd);
 
-  bool CoalescingAccessWay(const isl::schedule_node &node, const TensorFootprintCluster &cluster);
-
   isl::schedule InsertVectorizedMarker(const isl::schedule &sch);
 
   isl::schedule HoistCpuMemory();
diff --git a/src/poly/schedule_pass_gpu/mapping_outer_band.cc b/src/poly/schedule_pass_gpu/mapping_outer_band.cc
index 50c71391aceb5f63a83728d26a9007be4ba35653..0597d286fab56c8379755aa0295ca07fcea3666c 100644
--- a/src/poly/schedule_pass_gpu/mapping_outer_band.cc
+++ b/src/poly/schedule_pass_gpu/mapping_outer_band.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -401,31 +401,35 @@ isl::schedule_node MappingOuterBand::DoThreadMapping(const isl::schedule_node &o
       thread_record.emplace_back(std::make_pair(node, mapped_threads));
       return node;
     }
-
-    if (node.n_children() <= 1 || NumMappedDescendant(thread_record, node) <= 0) {
-      return node;
-    }
-    node = MapSequenceNode(node, thread_record);
-
-    auto need_sync = node.isa<isl::schedule_node_sequence>();
-    if (need_sync) {
-      if (is_reduce_stmt && node.has_parent() && !GetMarkerName(node.parent(), INSERT_SYNC).empty()) {
-        node = node.parent().del();
-        node = DoThreadSynchronization(node);
-      } else if (!is_reduce_stmt && scop_info_.user_config_.GetEnableTensorCoreUsePoly()) {
-        std::vector<MappingCfg *> other_mapping_cfg;
-        other_mapping_cfg.push_back(scop_info_.user_config_.GetReplaceConfig()[WARP_COMPUTE]);
-        node = DoThreadSynchronization(node, other_mapping_cfg);
-      } else if (!is_reduce_stmt) {
-        node = DoThreadSynchronization(node);
-      }
-    }
-
+    node = DoSequenceNodeMapping(node, thread_record, is_reduce_stmt);
     return node;
   };
   return orig_node.map_descendant_bottom_up(MapFromInner);
 }
 
+isl::schedule_node MappingOuterBand::DoSequenceNodeMapping(const isl::schedule_node &orig_node,
+                                                           const RoadMap &thread_record, const bool is_reduce_stmt) {
+  if (orig_node.n_children() <= 1 || NumMappedDescendant(thread_record, orig_node) <= 0) {
+    return orig_node;
+  }
+  isl::schedule_node node = MapSequenceNode(orig_node, thread_record);
+
+  auto need_sync = node.isa<isl::schedule_node_sequence>();
+  if (need_sync) {
+    if (is_reduce_stmt && node.has_parent() && !GetMarkerName(node.parent(), INSERT_SYNC).empty()) {
+      node = node.parent().del();
+      node = DoThreadSynchronization(node);
+    } else if (!is_reduce_stmt && scop_info_.user_config_.GetEnableTensorCoreUsePoly()) {
+      std::vector<MappingCfg *> other_mapping_cfg;
+      other_mapping_cfg.push_back(scop_info_.user_config_.GetReplaceConfig()[WARP_COMPUTE]);
+      node = DoThreadSynchronization(node, other_mapping_cfg);
+    } else if (!is_reduce_stmt) {
+      node = DoThreadSynchronization(node);
+    }
+  }
+  return node;
+}
+
 void MappingOuterBand::AdjustBlockConfig(MappingCfg *block_cfg, unsigned long n_block_map) {
   // For scalar case that do not consider coincidence (reset during restart in pass mgr), there is usually only one
   // member in outer band and we can map the maximal block size to that member.
diff --git a/src/poly/schedule_pass_gpu/mapping_outer_band.h b/src/poly/schedule_pass_gpu/mapping_outer_band.h
index c3ee450ac1f8a6c20ef4ad7acd2a804ae868936f..10c45fff7429b865e1363411560a3f1312ddcad5 100644
--- a/src/poly/schedule_pass_gpu/mapping_outer_band.h
+++ b/src/poly/schedule_pass_gpu/mapping_outer_band.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,11 +47,15 @@ class MappingOuterBand : public SchedulePass {
   size_t NumMappedDescendant(const RoadMap &thread_roadmap, const isl::schedule_node &parent);
   bool CanBeMappedToThread(const isl::schedule_node &node, const RoadMap &thread_record,
                            const std::string &marker_name);
-  isl::schedule_node FillRemainingThreads(const isl::schedule_node &orig_node, size_t begin);
-  isl::schedule_node MapSequenceNode(const isl::schedule_node &orig_node, const RoadMap &thread_record);
   bool IsEnableReduceLib(const isl::schedule_node &orig_node);
   void AdjustBlockConfig(MappingCfg *block_cfg, unsigned long n_block_map);
 
+  // Sequence node mapping
+  isl::schedule_node DoSequenceNodeMapping(const isl::schedule_node &orig_node, const RoadMap &thread_record,
+                                           const bool is_reduce_stmt);
+  isl::schedule_node MapSequenceNode(const isl::schedule_node &orig_node, const RoadMap &thread_record);
+  isl::schedule_node FillRemainingThreads(const isl::schedule_node &orig_node, size_t begin);
+
   // Functions related to synchronization.
   isl::schedule_node DoThreadSynchronization(const isl::schedule_node &node,
                                              const std::vector<MappingCfg *> &other_mapping_cfg = {});
diff --git a/src/poly/schedule_pass_gpu/operator_mapping_strategy.cc b/src/poly/schedule_pass_gpu/operator_mapping_strategy.cc
index 730a11226e53689e5417814da6e54cae9a5b383a..5310657106e6284b85df6ed562ddddff00f74b42 100644
--- a/src/poly/schedule_pass_gpu/operator_mapping_strategy.cc
+++ b/src/poly/schedule_pass_gpu/operator_mapping_strategy.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021 Huawei Technologies Co., Ltd
+ * Copyright 2021-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -136,8 +136,10 @@ isl::schedule_node OperatorMappingStrategy::MapDimToThreadsBlocks(const isl::sch
   if (is_promotion_mapping_ || scop_info_.user_config_.GetMindTrickWasUsed()) {
     node = CheckMapSizeAndApplyTile(node, upa_list, required_mapping_strategy_, mapping_cfg_);
     is_tiled = !node.is_equal(orig_node);
+    // insert node with specific marker
     if (is_insert_marker) {
       std::string marker_name = is_thread_mapping_ ? THREAD_MARKER : BLOCK_MARKER;
+      marker_name = is_promotion_mapping_ ? marker_name + SHARE_SUFFIX : marker_name;
       node = node.insert_mark(isl::id(node.ctx(), marker_name)).child(0);
     }
 
@@ -600,8 +602,9 @@ isl::schedule_node ReduceMappingStrategy::InsertReduceExtension(const isl::sched
   insert_node = InsertExtensionNodeBeforeOrAfter(insert_node, sync_id, false).parent();
   insert_node = insert_node.parent().insert_mark(REDUCE_AREA_FLAG);
 
-  if (!GetMarkerName(insert_node.ancestor(2), REDUCE_MARKER).empty()) {
-    insert_node = insert_node.ancestor(2).del();
+  auto tmp_node = insert_node.parent().parent();
+  if (!GetMarkerName(tmp_node, REDUCE_MARKER).empty()) {
+    insert_node = tmp_node.del();
   }
 
   return insert_node;
diff --git a/src/poly/schedule_pass_gpu/operator_shared_strategy.cc b/src/poly/schedule_pass_gpu/operator_shared_strategy.cc
deleted file mode 100644
index 3d97167279479d07611425c82b4ee3902ab72c17..0000000000000000000000000000000000000000
--- a/src/poly/schedule_pass_gpu/operator_shared_strategy.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/**
- * Copyright 2021-2022 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "operator_shared_strategy.h"
-#include "poly/schedule_tree_util.h"
-#include "poly/scop.h"
-#include "poly/dma_inject.h"
-#include "poly/poly_util.h"
-#include <vector>
-#include <numeric>
-
-namespace akg {
-namespace ir {
-namespace poly {
-
-std::set<std::string> OperatorSharedStrategy::GetInitPromotedTensor() {
-  auto read_map = scop_info_.StmtReadMap();
-  auto write_map = scop_info_.StmtWriteMap();
-  std::set<std::string> id_sets;
-  std::set<std::string> read_sets;
-  std::set<std::string> write_sets;
-  for (auto item : read_map) {
-    for (auto item_id : item.second) {
-      if (read_sets.count(item_id.get_name()) == 0) {
-        read_sets.insert(item_id.get_name());
-      }
-    }
-  }
-  for (auto item : write_map) {
-    for (auto item_id : item.second) {
-      if (write_sets.count(item_id.get_name()) == 0) {
-        write_sets.insert(item_id.get_name());
-      }
-    }
-  }
-  /*********************************************************
-   * manage only read tensors to share memory
-   * for read and write tensor, should be managed to local memory
-   ********************************************************/
-  std::set_difference(read_sets.begin(), read_sets.end(), write_sets.begin(), write_sets.end(),
-                      std::inserter(id_sets, id_sets.begin()));
-
-  if (scop_info_.analysis_result_.GetTensorOfTensor() ||
-      scop_info_.analysis_result_.GetOpTemplate() == Template::COUNT_OP) {
-    id_sets.clear();
-    std::set_union(read_sets.begin(), read_sets.end(), write_sets.begin(), write_sets.end(),
-                   std::inserter(id_sets, id_sets.begin()));
-  }
-
-  return id_sets;
-}
-
-void OperatorSharedStrategy::RecordPromotedTensorInfo(const isl::schedule_node &orig_node,
-                                                      const std::set<std::string> &id_sets,
-                                                      const std::string &mark_name) {
-  std::vector<isl::id> tensor_list;
-  for (auto item : id_sets) {
-    tensor_list.push_back(isl::id(scop_info_.ctx_, item));
-  }
-  isl::union_map reads = scop_info_.analysis_result_.GetReads();
-  isl::union_map writes = scop_info_.analysis_result_.GetWrites();
-  isl::union_map copyin = scop_info_.analysis_result_.GetCopyin();
-  isl::union_map fake_copyin = scop_info_.analysis_result_.GetFakeCopyin();
-
-  std::vector<isl::schedule_node> nodes = CollectMarkNode(orig_node, mark_name);
-
-  // Collect block config.
-  auto block_cfg = scop_info_.user_config_.GetBlockConfig();
-  CHECK(block_cfg != nullptr) << "block config is null";
-  auto replace_cfg = scop_info_.user_config_.GetReplaceConfig();
-  MappingStrategyAxisMap mapping_strategy = scop_info_.user_config_.GetOuterMappingStrategy(band_index_);
-  std::unordered_set<std::string> non_repeated_idx = GetNonRepeatedIdx(mapping_strategy);
-  auto mapping_filter_info = GetMappingFilterInfo(orig_node.root(), block_cfg, replace_cfg, non_repeated_idx);
-
-  for (const auto &node : nodes) {
-    auto tree = node.parent();
-    CHECK(!IsAncestorMapToThread(tree)) << "shared memory promotion cannot below thread_marker.";
-    auto partial_sched = LocalSchedule(tree);
-    if (!mapping_filter_info.is_empty()) {
-      partial_sched = partial_sched.intersect_domain(mapping_filter_info);
-    }
-
-    for (const auto &item : tensor_list) {
-      GpuMemType gpu_mem_type = GpuMemType::SHARED;
-      MemType mem_type = MemType::SHARED_;
-      if (is_local_) {
-        gpu_mem_type = GpuMemType::LOCAL;
-        mem_type = MemType::LOCAL_;
-      }
-
-      isl::id dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, item);
-      if (scop_info_.IsCopyinTensor(item.get_name()) && band_index_ != 0) {
-        dst_tensor_id = GetGpuIndexDstId(gpu_mem_type, item, band_index_);
-      }
-      std::vector<size_t> buffer_sizes;
-      std::vector<std::pair<isl::id, MemType>> data_stream;
-      data_stream.push_back(std::make_pair(item, MemType::DDR));
-      data_stream.push_back(std::make_pair(item, mem_type));
-      BufferDefInfo promoted_info = BufferDefInfo{item,
-                                                  dst_tensor_id,
-                                                  item,
-                                                  MemType::DDR,
-                                                  mark_name,
-                                                  false,
-                                                  false,
-                                                  data_stream,
-                                                  Tensor(),
-                                                  Handle(),
-                                                  buffer_sizes,
-                                                  nullptr,
-                                                  isl::union_map::empty(isl::space(scop_info_.ctx_, 0))};
-      promoted_info.footprints_cluster =
-        TensorFootprintCluster::HoistBufferFootprintCluster(partial_sched, item, reads, copyin, writes, fake_copyin);
-      if (promoted_info.footprints_cluster == nullptr) {
-        continue;
-      }
-      promoted_info.footprint_cluster_map.emplace_back(std::make_pair(tree, promoted_info.footprints_cluster));
-      scop_info_.analysis_result_.buffer_def_infos_.push_back(promoted_info);
-    }
-  }
-}
-
-void OperatorSharedStrategy::RecordCustomPromotedTensors(std::set<std::string> &id_sets) {
-  if (scop_info_.user_config_.GetSharedTensors().empty()) {
-    return;
-  }
-  std::vector<std::string> configed_tensors = Split(scop_info_.user_config_.GetSharedTensors(), " ");
-  for (const auto &item : configed_tensors) {
-    if (id_sets.count(item) == 0) {
-      id_sets.emplace(item);
-    }
-  }
-}
-
-void OperatorSharedStrategy::DeleteNotPromotedTensors(std::set<std::string> &id_sets) {
-  if (scop_info_.analysis_result_.GetTensorsNotPromote().empty()) {
-    return;
-  }
-  std::unordered_set<std::string> tensors = scop_info_.analysis_result_.GetTensorsNotPromote();
-  for (const auto &item : tensors) {
-    if (id_sets.count(item)) {
-      id_sets.erase(item);
-    }
-  }
-}
-
-void OperatorSharedStrategy::CreateClusterList(const isl::schedule_node &node) {
-  std::set<std::string> id_sets = GetInitPromotedTensor();
-  RecordCustomPromotedTensors(id_sets);
-  DeleteNotPromotedTensors(id_sets);
-  for (auto mark_name : mark_names_) {
-    RecordPromotedTensorInfo(node, id_sets, mark_name);
-  }
-}
-
-void ReduceSharedStrategy::CreateClusterList(const isl::schedule_node &node) {
-  std::set<std::string> id_sets = AnalysisReduceTensors();
-  RecordCustomPromotedTensors(id_sets);
-  for (auto mark_name : mark_names_) {
-    RecordPromotedTensorInfo(node, id_sets, mark_name);
-  }
-}
-
-std::set<std::string> ReduceSharedStrategy::AnalysisReduceTensors() {
-  std::set<std::string> id_sets;
-  /*************************************************
-   * In order to enable cuda atomic operator, add
-   * these tensors for shared memory promotion list
-   *************************************************/
-  auto atomic_tensors = scop_info_.analysis_result_.GetAtomicTensors();
-  if (!atomic_tensors.empty()) {
-    for (const auto &item : atomic_tensors) {
-      if (id_sets.count(item.tensor_name) == 0) {
-        id_sets.emplace(item.tensor_name);
-      }
-    }
-  }
-
-  /***********************************************
-   * For the condition that it is without cuda
-   * atomic usage, but with reduce operation.
-   * Also need to add these tensors for shared memory
-   * promotion list.
-   *********************************************/
-  auto reduce_out_tensors = scop_info_.analysis_result_.GetReduceTensorInfoMap();
-  for (const auto &item : reduce_out_tensors) {
-    if (id_sets.count(item.second.write_tensor_name) == 0) {
-      id_sets.emplace(item.second.write_tensor_name);
-    }
-  }
-
-  return id_sets;
-}
-
-void BatchMatmulSharedStrategy::CreateClusterList(const isl::schedule_node &node) {
-  std::set<std::string> id_sets = GetInitPromotedTensor();
-  RecordCustomPromotedTensors(id_sets);
-
-  auto tensors = GetMatmulTensorsName(scop_info_);
-  if (id_sets.count(tensors[MATRIX_A]) == 0) {
-    id_sets.emplace(tensors[MATRIX_A]);
-  }
-  if (id_sets.count(tensors[MATRIX_B]) == 0) {
-    id_sets.emplace(tensors[MATRIX_B]);
-  }
-
-  auto DeleteTensorSets = [this](const std::set<std::string> &id_sets,
-                                 const bool hoist_tensor_c) -> std::set<std::string> {
-    std::set<std::string> final_id_sets = id_sets;
-    auto it = final_id_sets.begin();
-    while (it != final_id_sets.end()) {
-      if (!hoist_tensor_c) {
-        if (GetTensorMark(*it, scop_info_) == TENSOR_C) {
-          it = final_id_sets.erase(it);
-          continue;
-        }
-      } else {
-        if (GetTensorMark(*it, scop_info_) != TENSOR_C) {
-          it = final_id_sets.erase(it);
-          continue;
-        }
-      }
-      ++it;
-    }
-    return final_id_sets;
-  };
-
-  for (auto mark_name : mark_names_) {
-    bool hoist_tensor_c = mark_name == PROMOTE_GLOBAL_TO_SHARED_C;
-    auto final_id_sets = DeleteTensorSets(id_sets, hoist_tensor_c);
-    RecordPromotedTensorInfo(node, final_id_sets, mark_name);
-  }
-}
-
-std::set<std::string> CpuMemoryStrategy::GetInitPromotedTensor() {
-  auto read_map = scop_info_.StmtReadMap();
-  auto write_map = scop_info_.StmtWriteMap();
-  std::set<std::string> id_sets;
-  for (auto item : read_map) {
-    for (auto item_id : item.second) {
-      if (id_sets.count(item_id.get_name()) == 0) {
-        id_sets.insert(item_id.get_name());
-      }
-    }
-  }
-
-  for (auto item : write_map) {
-    for (auto item_id : item.second) {
-      if (id_sets.count(item_id.get_name()) == 0) {
-        id_sets.insert(item_id.get_name());
-      }
-    }
-  }
-
-  return id_sets;
-}
-
-void CpuMemoryStrategy::CreateClusterList(const isl::schedule_node &node) {
-  std::set<std::string> id_sets = GetInitPromotedTensor();
-  RecordCustomPromotedTensors(id_sets);
-  DeleteNotPromotedTensors(id_sets);
-  auto it = id_sets.begin();
-  while (it != id_sets.end()) {
-    if (GetTensorMark(*it, scop_info_) == TENSOR_C) {
-      it = id_sets.erase(it);
-      continue;
-    }
-    ++it;
-  }
-
-  for (auto mark_name : mark_names_) {
-    RecordPromotedTensorInfo(node, id_sets, mark_name);
-  }
-}
-
-}  // namespace poly
-}  // namespace ir
-}  // namespace akg
diff --git a/src/poly/schedule_pass_gpu/operator_shared_strategy.h b/src/poly/schedule_pass_gpu/operator_shared_strategy.h
deleted file mode 100644
index 124caf8737c9199fa0757ea4c53734ca2772e355..0000000000000000000000000000000000000000
--- a/src/poly/schedule_pass_gpu/operator_shared_strategy.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Copyright 2021 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef POLY_OPRATOR_SHARED_STRATEGY_H_
-#define POLY_OPRATOR_SHARED_STRATEGY_H_
-
-#include "poly/schedule_pass.h"
-
-namespace akg {
-namespace ir {
-namespace poly {
-
-class OperatorSharedStrategy {
- public:
-  explicit OperatorSharedStrategy(ScopInfo &scop_info, std::unordered_set<std::string> &mark_names, int filter_pos)
-      : scop_info_(scop_info), mark_names_(mark_names), band_index_(filter_pos) {}
-  ~OperatorSharedStrategy() {}
-
-  std::set<std::string> GetInitPromotedTensor();
-  void RecordPromotedTensorInfo(const isl::schedule_node &orig_node, const std::set<std::string> &id_sets,
-                                const std::string &mark_names);
-  void CreateClusterList(const isl::schedule_node &node);
-  void RecordCustomPromotedTensors(std::set<std::string> &id_sets);
-  void DeleteNotPromotedTensors(std::set<std::string> &id_sets);
-
- protected:
-  ScopInfo &scop_info_;
-  std::unordered_set<std::string> mark_names_;
-  int band_index_;
-  bool is_local_{false};
-};
-
-class ReduceSharedStrategy : public OperatorSharedStrategy {
- public:
-  explicit ReduceSharedStrategy(ScopInfo &scop_info, std::unordered_set<std::string> &mark_names, int filter_pos)
-      : OperatorSharedStrategy(scop_info, mark_names, filter_pos) {}
-  ~ReduceSharedStrategy() {}
-
-  void CreateClusterList(const isl::schedule_node &node);
-  std::set<std::string> AnalysisReduceTensors();
-};
-
-class BatchMatmulSharedStrategy : public OperatorSharedStrategy {
- public:
-  explicit BatchMatmulSharedStrategy(ScopInfo &scop_info, std::unordered_set<std::string> &mark_names, int filter_pos)
-      : OperatorSharedStrategy(scop_info, mark_names, filter_pos) {}
-  ~BatchMatmulSharedStrategy() {}
-
-  void CreateClusterList(const isl::schedule_node &node);
-};
-
-class CpuMemoryStrategy : public OperatorSharedStrategy {
- public:
-  explicit CpuMemoryStrategy(ScopInfo &scop_info, std::unordered_set<std::string> &mark_names, int filter_pos)
-      : OperatorSharedStrategy(scop_info, mark_names, filter_pos) {
-    is_local_ = true;
-  }
-  ~CpuMemoryStrategy() {}
-  std::set<std::string> GetInitPromotedTensor();
-  void CreateClusterList(const isl::schedule_node &node);
-};
-}  // namespace poly
-}  // namespace ir
-}  // namespace akg
-
-#endif  // POLY_OPRATOR_SHARED_STRATEGY_H_
\ No newline at end of file
diff --git a/src/poly/schedule_pass_gpu/register_memory_manager.cc b/src/poly/schedule_pass_gpu/register_memory_manager.cc
index 3ca6fef22b83e330b9281f7b180a2ee1679d78da..7cec1b89dc485aa42bb9fb1919cf645aa9313fe6 100644
--- a/src/poly/schedule_pass_gpu/register_memory_manager.cc
+++ b/src/poly/schedule_pass_gpu/register_memory_manager.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,332 +15,192 @@
  */
 
 #include "register_memory_manager.h"
-
-#include <numeric>
-
+#include "poly/create_cluster.h"
 #include "poly/scop.h"
 #include "poly/dma_inject.h"
 #include "poly/poly_util.h"
 
+#include <numeric>
 namespace akg {
 namespace ir {
 namespace poly {
 
-void RegisterMemoryManager::GetActualPromotedSharedTensors() {
-  for (const auto &buffer : scop_info_.analysis_result_.active_buffer_footprints_) {
-    auto cluster_id = buffer.second.cluster_id;
-    shared_tensors_ += cluster_id.name() + " ";
+isl::schedule RegisterMemoryManager::Run(isl::schedule sch) {
+  if (!scop_info_.user_config_.GetUseRegisterMemory()) {
+    return sch;
   }
-}
 
-isl::schedule RegisterMemoryManager::HoistRegisterMemoryOnDepth(isl::schedule_node &node, size_t depth) {
-  auto res_node = node;
-  isl::schedule_node root_node = node.get_schedule().get_root();
+  schedule_ = sch;
+  sch = HoistRegisterMemory();
+  return sch;
+}
 
-  auto block_cfg = scop_info_.user_config_.GetBlockConfig();
-  CHECK(block_cfg != nullptr) << "block config is null";
-  auto replace_cfg = scop_info_.user_config_.GetReplaceConfig();
-  MappingStrategyAxisMap mapping_strategy = scop_info_.user_config_.GetOuterMappingStrategy(0);
-  std::unordered_set<std::string> non_repeated_idx = GetNonRepeatedIdx(mapping_strategy);
-  auto block_mapping = GetMappingFilterInfo(root_node, block_cfg, replace_cfg, non_repeated_idx);
-
-  auto thread_cfg = scop_info_.user_config_.GetThreadConfig();
-  CHECK(thread_cfg != nullptr) << "thread config is null";
-  auto thread_mapping = isl::union_set::empty(block_mapping.ctx());
-  mapping_strategy = scop_info_.user_config_.GetInnerMappingStrategy(0);
-  non_repeated_idx = GetNonRepeatedIdx(mapping_strategy);
-  thread_mapping = GetMappingFilterInfo(root_node, thread_cfg, replace_cfg, non_repeated_idx);
-
-  auto partial_sched = LocalSchedule(node);
-  if (!thread_mapping.is_empty() && !block_mapping.is_empty()) {
-    auto mapping = block_mapping.intersect(thread_mapping);
-    partial_sched = partial_sched.intersect_domain(mapping);
-  } else if (!thread_mapping.is_empty()) {
-    partial_sched = partial_sched.intersect_domain(thread_mapping);
-  } else if (!block_mapping.is_empty()) {
-    partial_sched = partial_sched.intersect_domain(block_mapping);
+isl::schedule_node RegisterMemoryManager::HoistRegisterMemoryOnMark(const isl::schedule_node &orig_node) {
+  current_outer_bn_ = scop_info_.analysis_result_.GetOuterBandNode(band_index_);
+  if (!current_outer_bn_->use_register_memory) {
+    return orig_node;
   }
 
-  CreateTensorCluster(node, partial_sched);
+  CreateClusterForOperator(orig_node);
 
-  isl::schedule sch = schedule_;
+  std::string mark_name;
+  auto GetMarkNode = [this, &mark_name](isl::schedule_node node) -> isl::schedule_node {
+    if (!node.isa<isl::schedule_node_mark>()) {
+      return node;
+    }
 
-  auto thread_schedule = MapDomainAllWithType(root_node, thread_cfg, scop_info_.upa_node_mapping_, THREAD_MARKER);
-  auto block_schedule = MapDomainAllWithType(root_node, block_cfg, scop_info_.upa_node_mapping_, BLOCK_MARKER);
+    std::string tmp_mark_name = node.as<isl::schedule_node_mark>().get_id().get_name();
+    if (tmp_mark_name != mark_name) {
+      return node;
+    }
 
-  auto tmp_node = res_node;
-  if (node.isa<isl::schedule_node_band>()) {
-    tmp_node = res_node.child(0);
-  }
+    return HoistClusters(node.parent()).child(0);
+  };
 
-  int64_t alloc_threads = 1;
-  if (thread_cfg != nullptr) {
-    for (size_t i = 0; i < thread_cfg->bound; ++i) {
-      alloc_threads *= thread_cfg->GetAt(i).second;
-    }
+  auto node = orig_node;
+  for (auto name : mark_names_) {
+    mark_name = name;
+    node = MapDescendantTopDown(node, GetMarkNode);
   }
+  node = InsertMarkerForEmit(node);
+  node = DeleUselessMarker(node, mark_names_);
+  return node;
+}
 
-  auto partial_sched_mupa = ShortScheduleMupa(root_node, tmp_node);
-  auto partial_sched_with_block = isl::union_map::from(partial_sched_mupa).intersect_domain(block_mapping);
-  partial_sched_mupa = partial_sched_mupa.flat_range_product(block_schedule).flat_range_product(thread_schedule);
-  for (size_t index = 0; index < scop_info_.analysis_result_.buffer_def_infos_.size(); index++) {
-    BufferDefInfo &buffer_info = scop_info_.analysis_result_.buffer_def_infos_[index];
-
-    if (buffer_info.dst_tensor_id.to_str().find(SHARE_SUFFIX) != std::string::npos) {
-      continue;
-    }
+isl::schedule RegisterMemoryManager::HoistRegisterMemory() {
+  isl::schedule_node node = GetOuterBand(schedule_.root());
+  if (node.isa<isl::schedule_node_band>()) {
+    node = HoistRegisterMemoryOnMark(node);
+  } else {
+    int number = static_cast<int>(node.n_children());
+    for (int i = 0, current_band_index = 0; i < number; ++i) {
+      auto promotion_node = node.child(i).child(0);
+      if (promotion_node.isa<isl::schedule_node_leaf>()) continue;
 
-    if (scop_info_.user_config_.GetEnableMatmul() && !hoist_tensor_all_) {
-      if (!hoist_compute_local_tensor_) {
-        if (GetTensorMark(buffer_info.dst_tensor_id.get_name(), scop_info_) == TENSOR_C) {
-          continue;
-        }
-      } else {
-        if (GetTensorMark(buffer_info.dst_tensor_id.get_name(), scop_info_) != TENSOR_C) {
-          continue;
-        }
-      }
+      mark_names_.clear();
+      band_index_ = current_band_index;
+      node = HoistRegisterMemoryOnMark(promotion_node);
+      node = node.parent().parent();
+      ++current_band_index;
     }
+  }
 
-    auto fp_cluster = buffer_info.GetFootPrintClusterGPU(res_node);
+  return node.get_schedule();
+}
 
-    if (fp_cluster == nullptr || !fp_cluster->foot_print_.box.is_valid()) {
-      continue;
-    }
+void RegisterMemoryManager::SetPromotedWriteNameForGemm(std::string &local_tensor_c) {
+  write_name_ = GML_WRITE_ID_NAME;
+  std::string shared_tensors;
+  for (const auto &buffer : scop_info_.analysis_result_.active_buffer_footprints_) {
+    auto cluster_id = buffer.second.cluster_id;
+    shared_tensors += cluster_id.name() + " ";
+  }
+  if (shared_tensors.find(local_tensor_c) != std::string::npos) {
+    write_name_ = SHARED_WRITE_ID_NAME;
+  }
+}
 
-    auto tensor_id = buffer_info.tensor_id;
-    auto box_sizes = fp_cluster->GetFixedBoxSizes();
+void RegisterMemoryManager::CreateClusterForOperator(const isl::schedule_node &node) {
+  RegisterCreateCluster create_cluster(scop_info_, band_index_);
+  if (scop_info_.user_config_.GetEnableMatmul()) {
+    // matmul operator
+    std::string local_tensor_c = GetMatmulTensorsName(scop_info_)[MATRIX_C];
+    SetPromotedWriteNameForGemm(local_tensor_c);
 
-    if (box_sizes.size() == 0) {
-      LOG(FATAL) << "Can not manage a scalar tensor in register memory promotion";
+    auto config_shared_tensors = scop_info_.user_config_.GetSharedTensors();
+    auto c_mark = PROMOTE_GLOBAL_TO_REGISTER_C;
+    if (config_shared_tensors.find(local_tensor_c) != config_shared_tensors.end()) {
+      c_mark = PROMOTE_SHARED_TO_REGISTER_C;
     }
 
-    if (!IsPromote(*fp_cluster, partial_sched_mupa, thread_schedule)) {
-      continue;
-    }
+    mark_names_.emplace(PROMOTE_SHARED_TO_REGISTER_AB);
+    mark_names_.emplace(c_mark);
 
-    if (!scop_info_.user_config_.GetEnableTensorCore() && !scop_info_.user_config_.GetEnableMatmul() &&
-        !scop_info_.user_config_.GetEnableVectorization()) {
-      if (!ReuseTensorCluster(*fp_cluster, partial_sched_mupa)) {
-        continue;
-      }
-    }
+    create_cluster.CreateClusterListForGemm(node, mark_names_);
+  } else {
+    mark_names_.emplace(PROMOTE_GLOBAL_TO_REGISTER);
+    create_cluster.CreateClusterListForElementWise(node, mark_names_);
+  }
+}
 
-    auto tensor_size = std::accumulate(box_sizes.begin(), box_sizes.end(), 1, std::multiplies<size_t>());
+isl::schedule_node RegisterMemoryManager::InsertMarkerForEmit(const isl::schedule_node &orig_node) {
+  auto node = orig_node;
+  if (scop_info_.user_config_.GetEnableMatmul()) {
     if (scop_info_.user_config_.GetEnableTensorCoreUsePoly()) {
-      tensor_size = (buffer_info.tensor_id.get_name() == local_tensor_c_) ? (tensor_size / alloc_threads)
-                                                                          : (tensor_size * 2 / alloc_threads);
+      node = TileTensorAccordingInterfaceValue(orig_node);
     }
-    auto data_bytes = scop_info_.user_config_.GetDataBytes(buffer_info.tensor_id.get_name());
-    tensor_size = tensor_size * std::max<int>(1, data_bytes / BYTES_PER_REGISTER);
-    size_t memory_requirement = tensor_size * alloc_threads;
-
-    if (memory_requirement <= remain_memory_) {
-      auto active_domains = CollectDomain(res_node);
-      isl::id dst_tensor_id = buffer_info.dst_tensor_id;
-      GatherBufferFootprintDefInfo(res_node, buffer_info);
-      if (scop_info_.user_config_.GetEnableMatmul()) {
-        if (tensor_id.get_name().find(SHARE_SUFFIX) != std::string::npos) {
-          std::shared_ptr<TensorFootprintCluster> src_fp_cluster;
-          isl::union_map sch_map = scop_info_.analysis_result_.GetScheduleMapBeforeTile();
-          for (auto &buffer : scop_info_.analysis_result_.active_buffer_footprints_) {
-            if (tensor_id == buffer.second.cluster_id) {
-              src_fp_cluster = buffer.second.cluster;
-              break;
-            }
-          }
-          if (src_fp_cluster != nullptr) {
-            node = PlaceInnerDataCopyBelow(scop_info_, node, *fp_cluster, *src_fp_cluster, tensor_id, dst_tensor_id,
-                                           tensor_id, sch_map);
-          }
-        } else {
-          node = PlaceOuterDataCopyBelow(scop_info_, node, *fp_cluster, tensor_id, dst_tensor_id, partial_sched,
-                                         schedule_.get_domain().get_space());
-        }
-      } else {
-        node = PlaceOuterDataCopyBelow(scop_info_, node, *fp_cluster, tensor_id, dst_tensor_id, partial_sched,
-                                       schedule_.get_domain().get_space());
-      }
-
-      remain_memory_ -= memory_requirement;
-
-      // active_buffer_footprints for codegen
-      scop_info_.analysis_result_.active_buffer_footprints_.emplace_back(std::make_pair(
-        active_domains, BufferedFootPrintInfo{std::shared_ptr<TensorFootprintCluster>(std::move(fp_cluster)),
-                                              partial_sched, dst_tensor_id}));
-      buffer_info.find_buffer = true;
+    std::string marker_name = PROMOTE_REGISTER_TO_GLOBAL;
+    if (write_name_ == SHARED_WRITE_ID_NAME) {
+      marker_name = PROMOTE_REGISTER_TO_SHARED;
     }
+    node = InsertMarkerForThreadGroup(node, write_name_, marker_name);
+  } else if (current_outer_bn_->enable_vectorization) {
+    node = InsertMarkerForThreadGroup(node, GML_READ_ID_NAME, PROMOTE_VECTORIZATION);
+    node = InsertMarkerForThreadGroup(node, GML_WRITE_ID_NAME, PROMOTE_VECTORIZATION);
   }
-  sch = node.get_schedule();
-  return sch;
+  return node;
 }
 
-/*Check if the given "group" can be promoted to registers for the given
- * mapping to thread identifiers and within the given outer schedule */
-bool RegisterMemoryManager::IsPromote(const TensorFootprintCluster &fp_cluster,
-                                      const isl::multi_union_pw_aff &partial_sched_mupa,
-                                      const isl::multi_union_pw_aff &thread_schedule) {
-  /* compute the mapping relation between single thread and outer schedule space and tensor elements pair */
-  isl::union_map state_schedule_mapping =
-    ScheduleTensorMapping(partial_sched_mupa, fp_cluster.OrigianlAccessRelations());
-  isl::union_map thread_schedule_mapping = state_schedule_mapping.apply_domain(isl::union_map::from(thread_schedule));
-  /* check that whether the mapping relation between single thread
-   * and outer schedule points and group elements pair is injective. */
-  return thread_schedule_mapping.is_injective();
-}
+isl::schedule_node RegisterMemoryManager::HoistClusters(const isl::schedule_node &node) {
+  auto res_node = node;
+  isl::schedule_node root_node = node.get_schedule().get_root();
 
-void RegisterMemoryManager::CreateTensorCluster(const isl::schedule_node &node, const isl::union_map &outer_sch) {
-  isl::union_map reads = scop_info_.analysis_result_.GetReads();
-  isl::union_map writes = scop_info_.analysis_result_.GetWrites();
-  isl::union_map copyin = scop_info_.analysis_result_.GetCopyin();
-  isl::union_map fake_copyin = scop_info_.analysis_result_.GetFakeCopyin();
-
-  auto read_map = scop_info_.StmtReadMap();
-  auto write_map = scop_info_.StmtWriteMap();
-  auto stmt_map = scop_info_.analysis_result_.GetStmtOpInfoMap();
-  std::vector<isl::id> tensor_list;
-  std::unordered_set<isl::id, isl::IslIdIslHash> id_sets;
-  for (auto item : read_map) {
-    for (auto item_id : item.second) {
-      id_sets.insert(item_id);
-    }
-  }
-  for (auto item : write_map) {
-    for (auto item_id : item.second) {
-      id_sets.insert(item_id);
-    }
-  }
+  isl::schedule sch = schedule_;
 
-  std::set<std::string> shared_dst_tensor_ids;
-  for (auto buffer : scop_info_.analysis_result_.active_buffer_footprints_) {
-    shared_dst_tensor_ids.insert(buffer.second.cluster_id.get_name());
-  }
-  if (!configed_tensors_.empty()) {
-    std::set<std::string> tensor_sets;
-    for (const auto &item : configed_tensors_) {
-      if (tensor_sets.count(item) == 0) {
-        tensor_sets.emplace(item);
-      }
-    }
-    id_sets.clear();
-    for (auto item : tensor_sets) {
-      id_sets.insert(isl::id(scop_info_.ctx_, item));
-    }
-  }
+  for (size_t index = 0; index < scop_info_.analysis_result_.buffer_def_infos_.size(); index++) {
+    BufferDefInfo &buffer_info = scop_info_.analysis_result_.buffer_def_infos_[index];
 
-  for (auto item : id_sets) {
-    if (scop_info_.user_config_.GetEnableMatmul()) {
-      tensor_list.push_back(item);
-    } else {
-      if (shared_dst_tensor_ids.count(item.get_name() + SHARE_SUFFIX)) {
-        continue;
-      }
+    if (buffer_info.dst_tensor_id.to_str().find(SHARE_SUFFIX) != std::string::npos) {
+      continue;
+    }
 
-      std::unordered_set<std::string> tensors = scop_info_.analysis_result_.GetTensorsNotPromote();
-      if (tensors.count(item.get_name())) {
-        continue;
-      }
+    auto fp_cluster = buffer_info.GetFootPrintClusterGPU(node);
 
-      tensor_list.push_back(item);
+    if (fp_cluster == nullptr || !fp_cluster->foot_print_.box.is_valid()) {
+      continue;
     }
-  }
 
-  std::vector<BufferDefInfo> promoted_infos;
+    auto tensor_id = buffer_info.tensor_id;
+    RegisterCreateCluster create_cluster(scop_info_, band_index_);
+    isl::union_map partial_sched = create_cluster.GetPartialSchedule(node);
 
-  for (const auto &item : tensor_list) {
-    if (scop_info_.user_config_.GetEnableMatmul() && !hoist_tensor_all_) {
-      if (!hoist_compute_local_tensor_) {
-        if (GetTensorMark(item.get_name(), scop_info_) == TENSOR_C) {
-          continue;
+    auto active_domains = CollectDomain(node);
+    isl::id dst_tensor_id = buffer_info.dst_tensor_id;
+    GatherBufferFootprintDefInfo(node, buffer_info);
+    if (scop_info_.user_config_.GetEnableMatmul()) {
+      if (tensor_id.get_name().find(SHARE_SUFFIX) != std::string::npos) {
+        std::shared_ptr<TensorFootprintCluster> src_fp_cluster;
+        isl::union_map sch_map = scop_info_.analysis_result_.GetScheduleMapBeforeTile();
+        for (auto &buffer : scop_info_.analysis_result_.active_buffer_footprints_) {
+          if (tensor_id == buffer.second.cluster_id) {
+            src_fp_cluster = buffer.second.cluster;
+            break;
+          }
         }
-      } else {
-        if (GetTensorMark(item.get_name(), scop_info_) != TENSOR_C) {
-          continue;
+        if (src_fp_cluster != nullptr) {
+          if (!GetMarkerName(res_node.child(0), PROMOTE_SHARED_TO_REGISTER_C).empty()) {
+            res_node = res_node.child(0).del();
+            res_node = res_node.parent();
+          }
+          res_node = PlaceInnerDataCopyBelow(scop_info_, res_node, *fp_cluster, *src_fp_cluster, tensor_id,
+                                             dst_tensor_id, tensor_id, sch_map);
         }
+      } else {
+        res_node = PlaceOuterDataCopyBelow(scop_info_, res_node, *fp_cluster, tensor_id, dst_tensor_id, partial_sched,
+                                           schedule_.get_domain().get_space());
       }
-    }
-
-    isl::id dst_tensor_id = GetGpuIndexDstId(GpuMemType::LOCAL, item);
-    std::vector<size_t> buffer_sizes;
-    std::vector<std::pair<isl::id, MemType>> data_stream;
-    MemType memtype;
-    BufferDefInfo promoted_info;
-    isl::id tmp_item;
-    if (!shared_dst_tensor_ids.count(item.get_name() + SHARE_SUFFIX)) {
-      tmp_item = item;
-      data_stream.push_back(std::make_pair(item, MemType::DDR));
-      data_stream.push_back(std::make_pair(item, MemType::LOCAL_));
-      memtype = MemType::DDR;
-    } else {
-      tmp_item = isl::id(scop_info_.ctx_, item.get_name() + SHARE_SUFFIX);
-      data_stream.push_back(std::make_pair(item, MemType::SHARED_));
-      data_stream.push_back(std::make_pair(item, MemType::LOCAL_));
-      memtype = MemType::SHARED_;
-    }
-    promoted_info = BufferDefInfo{tmp_item,
-                                  dst_tensor_id,
-                                  tmp_item,
-                                  memtype,
-                                  "",
-                                  false,
-                                  false,
-                                  data_stream,
-                                  Tensor(),
-                                  Handle(),
-                                  buffer_sizes,
-                                  nullptr,
-                                  isl::union_map::empty(isl::space(scop_info_.ctx_, 0))};
-    promoted_info.footprints_cluster =
-      TensorFootprintCluster::HoistBufferFootprintCluster(outer_sch, item, reads, copyin, writes, fake_copyin);
-    if (promoted_info.footprints_cluster != nullptr) {
-      promoted_info.footprint_cluster_map.emplace_back(std::make_pair(node, promoted_info.footprints_cluster));
-      promoted_infos.push_back(promoted_info);
-    }
-  }
-
-  for (auto promoted_info : promoted_infos) {
-    auto name = promoted_info.tensor_id.get_name();
-    if (CheckRAW(name)) {
-      scop_info_.analysis_result_.buffer_def_infos_.insert(scop_info_.analysis_result_.buffer_def_infos_.begin(),
-                                                           promoted_info);
     } else {
-      scop_info_.analysis_result_.buffer_def_infos_.push_back(promoted_info);
-    }
-  }
-}
-
-bool RegisterMemoryManager::CheckRAW(std::string &name) {
-  isl::union_map reads = scop_info_.analysis_result_.GetReads();
-  isl::union_map reads_filter = isl::union_map::empty(scop_info_.GetCtx());
-  auto reads_range = reads.range();
-  isl::union_set reads_uset = isl::union_set::empty(scop_info_.GetCtx());
-  reads_range.foreach_set([this, name, &reads_uset](const isl::set &s) -> void {
-    std::string tensor_name = s.get_tuple_name();
-    if (tensor_name == name) {
-      reads_uset = reads_uset.add_set(s);
-    }
-  });
-
-  reads_filter = reads.intersect_range(reads_uset);
-  isl::union_map writes = scop_info_.analysis_result_.GetWrites();
-  isl::union_map writes_filter = isl::union_map::empty(scop_info_.GetCtx());
-  auto writes_range = writes.range();
-  isl::union_set writes_uset = isl::union_set::empty(scop_info_.GetCtx());
-  writes_range.foreach_set([this, name, &writes_uset](const isl::set &s) -> void {
-    std::string tensor_name = s.get_tuple_name();
-    if (tensor_name == name) {
-      writes_uset = writes_uset.add_set(s);
+      res_node = PlaceOuterDataCopyBelow(scop_info_, res_node, *fp_cluster, tensor_id, dst_tensor_id, partial_sched,
+                                         schedule_.get_domain().get_space());
     }
-  });
 
-  writes_filter = writes.intersect_range(writes_uset);
-  isl::union_map rawmap = ComputeRAW(scop_info_.origin_schedule_, reads_filter, writes_filter);
-  // Input/output tensor has lower priority then temp tensor
-  if (rawmap.is_empty() || scop_info_.IsInBinds(name)) {
-    return false;
-  } else {
-    return true;
+    // active_buffer_footprints for codegen
+    scop_info_.analysis_result_.active_buffer_footprints_.emplace_back(std::make_pair(
+      active_domains, BufferedFootPrintInfo{std::shared_ptr<TensorFootprintCluster>(std::move(fp_cluster)),
+                                            partial_sched, dst_tensor_id}));
+    buffer_info.find_buffer = true;
   }
+  return res_node;
 }
 
 void RegisterMemoryManager::GatherBufferFootprintDefInfo(const isl::schedule_node &node, BufferDefInfo &tensor_info) {
@@ -372,211 +232,51 @@ void RegisterMemoryManager::GatherBufferFootprintDefInfo(const isl::schedule_nod
   tensor_info.AddSize(node, sizes);
 }
 
-size_t RegisterMemoryManager::UpdateDepth(const isl::schedule_node &node) {
-  auto band = node.as<isl::schedule_node_band>();
-  for (size_t i = 0; i < band.n_member(); i++) {
-    if (!band.member_get_coincident(i)) {
-      if (i == 0) {
-        return band.n_member();
-      } else {
-        return i;
-      }
-    }
-  }
-  return band.n_member() + node.schedule_depth();
-}
-
-isl::schedule RegisterMemoryManager::HoistRegisterMemory(isl::schedule_node root, size_t depth) {
-  auto bands = BandsContainingScheduleDepth(root, depth);
-  bands = FilterWithFunc(
-    [root, depth](isl::schedule_node node) {
-      auto band = node.as<isl::schedule_node_band>();
-      return !IsThreadMappedMark(node) || node.schedule_depth() + band.n_member() == depth;
-    },
-    bands);
-  bands = BandsSplitAfterDepth(bands, root, depth);
-
-  isl::schedule tmp_sch = root.get_schedule();
-  if (!bands.size()) {
-    return tmp_sch;
-  }
-
-  int distance_to_extension = 3;
-  for (auto band : bands) {
-    if (IsThreadMappedMark(band)) {
-      band = band.child(0);
-    }
-
-    if (IsReadOrWriteBand(band)) {
-      continue;
-    }
-
-    if (band.has_parent() && band.parent().has_parent() && band.parent().parent().has_parent() &&
-        band.ancestor(distance_to_extension) &&
-        band.ancestor(distance_to_extension).isa<isl::schedule_node_extension>()) {
-      break;
-    }
-    tmp_sch = HoistRegisterMemoryOnDepth(band, depth);
-    break;
-  }
-  return tmp_sch;
-}
-
-bool RegisterMemoryManager::IsReadOrWriteBand(isl::schedule_node node) {
-  if (node.parent().isa<isl::schedule_node_filter>()) {
-    auto filter = node.parent().as<isl::schedule_node_filter>();
-
-    isl::union_set uset = filter.get_filter();
-    std::vector<isl::set> vset;
-    uset.foreach_set([&vset](isl::set s) { vset.push_back(s); });
-    if (!vset.empty()) {
-      auto filter_name = vset[0].get_tuple_name();
-      if (filter_name == READ_ID_NAME || filter_name == WRITE_ID_NAME) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-isl::schedule_node RegisterMemoryManager::GetRegisterPromotedNode(isl::schedule_node &root) {
-  isl::schedule_node hoist_register_node = root;
-  root.foreach_descendant_top_down([&hoist_register_node, this](const isl::schedule_node &node) -> bool {
-    if (node.isa<isl::schedule_node_sequence>()) {
-      auto sequence_node = node.as<isl::schedule_node_sequence>();
-      if (sequence_node.parent().isa<isl::schedule_node_extension>() &&
-          sequence_node.parent().parent().isa<isl::schedule_node_band>()) {
-        hoist_register_node = sequence_node.parent().parent();
-        return false;
-      } else if (sequence_node.parent().isa<isl::schedule_node_band>()) {
-        hoist_register_node = sequence_node.parent();
-        return false;
-      }
-    }
-
-    if (node.isa<isl::schedule_node_mark>()) {
-      auto mark_node = node.as<isl::schedule_node_mark>();
-      if (scop_info_.user_config_.GetEnableVectorization()) {
-        if (mark_node.get_id().get_name() == THREAD_MARKER &&
-            mark_node.child(0).child(0).isa<isl::schedule_node_band>()) {
-          hoist_register_node = mark_node.child(0).child(0);
-          return false;
-        }
-      } else if (mark_node.get_id().get_name() == THREAD_MARKER && mark_node.parent().isa<isl::schedule_node_band>()) {
-        hoist_register_node = mark_node.parent();
-        return false;
-      }
+// According to the value of the conv interface, the size of the tensor is split to confirm the size of the fragment.
+isl::schedule_node RegisterMemoryManager::TileTensorAccordingInterfaceValue(const isl::schedule_node &orig_node) {
+  CHECK(scop_info_.user_config_.GetReplaceConfig().count(WARP_COMPUTE)) << "Cannot map to warp.";
+  auto CollectReadWriteFilter = [this](isl::schedule_node node) -> isl::schedule_node {
+    if (!node.isa<isl::schedule_node_filter>()) {
+      return node;
     }
-    return true;
-  });
-  return hoist_register_node;
-}
-
-isl::schedule_node RegisterMemoryManager::PromotedNodeUnderSequence(isl::schedule_node_sequence &node) {
-  int band_node_num = 0;
-  auto root = node.get_schedule().get_root();
-  auto tmp_node = root;
-
-  for (size_t i = 0; i < node.n_children(); ++i) {
-    if (IsReadOrWriteBand(node.child(i).child(0))) {
-      continue;
+    bool is_all_sets_read_or_write = IsReadOrWriteTensor(node, SHARED_READ_ID_NAME, write_name_);
+    if (!is_all_sets_read_or_write) {
+      return node;
     }
-    band_node_num += 1;
-    tmp_node = node.child(i);
-  }
 
-  auto hoist_register_node = root;
-  if (band_node_num == 1) {
-    tmp_node.foreach_descendant_top_down([&hoist_register_node](const isl::schedule_node &node) -> bool {
-      if (node.isa<isl::schedule_node_mark>()) {
-        auto mark_node = node.as<isl::schedule_node_mark>();
-        if (mark_node.get_id().get_name() == THREAD_MARKER &&
-            mark_node.child(0).child(0).isa<isl::schedule_node_band>()) {
-          hoist_register_node = mark_node.child(0).child(0);
-          return false;
-        }
-      }
-      return true;
-    });
-  }
-  return hoist_register_node;
-}
+    auto start_depth = node.get_tree_depth();
 
-isl::schedule_node RegisterMemoryManager::GetVectorizationPromotedNode(isl::schedule_node &root) {
-  isl::schedule_node hoist_register_node = root;
-  root.foreach_descendant_top_down([&hoist_register_node, this](const isl::schedule_node &node) -> bool {
-    if (node.isa<isl::schedule_node_sequence>()) {
-      auto sequence_node = node.as<isl::schedule_node_sequence>();
-      if (sequence_node.parent().isa<isl::schedule_node_extension>() &&
-          sequence_node.parent().parent().isa<isl::schedule_node_band>()) {
-        hoist_register_node = PromotedNodeUnderSequence(sequence_node);
-        return false;
-      } else if (sequence_node.parent().isa<isl::schedule_node_band>()) {
-        return false;
-      }
+    auto band_node = GetCanMappingNode(node);
+    std::string id_name = GetPromotionTensorName(band_node, scop_info_.analysis_result_.buffer_def_infos_);
+    if (id_name.empty() || !scop_info_.analysis_result_.GetMatrixMatmulMap().count(id_name) ||
+        !scop_info_.analysis_result_.GetMatrixMatmulMajor().count(id_name)) {
+      return node;
     }
 
-    if (node.isa<isl::schedule_node_mark>()) {
-      auto mark_node = node.as<isl::schedule_node_mark>();
-      if (mark_node.get_id().get_name() == THREAD_MARKER &&
-          mark_node.child(0).child(0).isa<isl::schedule_node_band>()) {
-        hoist_register_node = mark_node.child(0).child(0);
-        return false;
-      }
+    bool is_conv = scop_info_.user_config_.GetEnableConvTensorCore();
+    if (is_conv) {
+      band_node = AdjustConvScheduleTreeStructure(band_node);
     }
-    return true;
-  });
-  return hoist_register_node;
-}
-
-isl::schedule RegisterMemoryManager::HoistRegisterMemoryOnMark(isl::schedule_node root) {
-  std::string config_shared_tensors = scop_info_.user_config_.GetSharedTensors();
-  auto c_mark = PROMOTE_GLOBAL_TO_REGISTER_C;
-  if (config_shared_tensors.find(local_tensor_c_) != std::string::npos) {
-    c_mark = PROMOTE_SHARED_TO_REGISTER_C;
-  }
 
-  auto mark_node = CollectMarkNode(root, c_mark).at(0);
-  auto tmp_hoist_node = mark_node.parent();
-
-  while (!tmp_hoist_node.isa<isl::schedule_node_band>()) {
-    tmp_hoist_node = tmp_hoist_node.parent();
-  }
-
-  auto depth = tmp_hoist_node.child(0).schedule_depth();
-  auto hoist_compute_node = tmp_hoist_node.as<isl::schedule_node_band>();
-  for (size_t i = 0; i < hoist_compute_node.n_member(); ++i) {
-    if (!hoist_compute_node.member_get_coincident(i)) {
-      if (scop_info_.user_config_.GetEnableTensorCoreUsePoly() && i == 0) {
-        hoist_tensor_all_ = true;
-        auto hoist_node = mark_node.del().parent();
-        auto sch = HoistRegisterMemoryOnDepth(hoist_node, depth);
-        return sch;
-      }
-      hoist_compute_node = hoist_compute_node.split(i);
-      depth = depth - hoist_compute_node.n_member() + i;
+    auto mapping_cfg = scop_info_.user_config_.GetReplaceConfig()[WARP_COMPUTE];
+    CHECK(mapping_cfg != nullptr) << "mapping config is null";
+    // split member that does not involved in thread mapping
+    auto mem_size = band_node.as<isl::schedule_node_band>().n_member();
+    if (mem_size > mapping_cfg->bound) {
+      band_node = band_node.as<isl::schedule_node_band>().split(mem_size - mapping_cfg->bound);
+      band_node = band_node.child(0);
     }
-  }
-  auto sch = HoistRegisterMemoryOnDepth(hoist_compute_node, depth);
 
-  auto hoist_ab_root = sch.get_root();
-  auto ab_mark = PROMOTE_SHARED_TO_REGISTER_AB;
-  auto mark_ab_node = CollectMarkNode(hoist_ab_root, ab_mark).at(0);
-  auto hoist_ab_node = mark_ab_node.del().parent();
-  auto hoist_ab_depth = hoist_ab_node.schedule_depth();
-  hoist_compute_local_tensor_ = false;
-  sch = HoistRegisterMemoryOnDepth(hoist_ab_node, hoist_ab_depth);
+    std::string matrix_name = scop_info_.analysis_result_.GetMatrixMatmulMap()[id_name];
+    std::string matrix_major = scop_info_.analysis_result_.GetMatrixMatmulMajor()[id_name];
+    isl::multi_val tile_size_val = GetRealTileSizeVal(band_node, matrix_name, matrix_major);
+    band_node = TileBand(band_node, tile_size_val);
 
-  return sch;
-}
+    node = band_node.ancestor(band_node.get_tree_depth() - start_depth);
+    return node;
+  };
 
-std::string RegisterMemoryManager::GetPromotedWriteName() {
-  std::string write_name = GML_WRITE_ID_NAME;
-  std::string shared_tensors = shared_tensors_;
-  if (shared_tensors.find(local_tensor_c_) != std::string::npos) {
-    write_name = SHARED_WRITE_ID_NAME;
-  }
-  return write_name;
+  return orig_node.map_descendant_bottom_up(CollectReadWriteFilter);
 }
 
 isl::schedule_node RegisterMemoryManager::AdjustConvScheduleTreeStructure(const isl::schedule_node &orig_node) {
@@ -632,54 +332,6 @@ isl::schedule_node RegisterMemoryManager::AdjustConvScheduleTreeStructure(const
   return band_node;
 }
 
-// According to the value of the conv interface, the size of the tensor is split to confirm the size of the fragment.
-isl::schedule_node RegisterMemoryManager::TileTensorAccordingInterfaceValue(isl::schedule_node &root) {
-  CHECK(scop_info_.user_config_.GetReplaceConfig().count(WARP_COMPUTE)) << "Cannot map to warp.";
-  std::string write_name = GetPromotedWriteName();
-  auto CollectReadWriteFilter = [this, write_name](isl::schedule_node node) -> isl::schedule_node {
-    if (!node.isa<isl::schedule_node_filter>()) {
-      return node;
-    }
-    bool is_all_sets_read_or_write = IsReadOrWriteTensor(node, SHARED_READ_ID_NAME, write_name);
-    if (!is_all_sets_read_or_write) {
-      return node;
-    }
-
-    auto start_depth = node.get_tree_depth();
-
-    auto band_node = GetCanMappingNode(node);
-    std::string id_name = GetPromotionTensorName(band_node, scop_info_.analysis_result_.buffer_def_infos_);
-    if (id_name.empty() || !scop_info_.analysis_result_.GetMatrixMatmulMap().count(id_name) ||
-        !scop_info_.analysis_result_.GetMatrixMatmulMajor().count(id_name)) {
-      return node;
-    }
-
-    bool is_conv = scop_info_.user_config_.GetEnableConvTensorCore();
-    if (is_conv) {
-      band_node = AdjustConvScheduleTreeStructure(band_node);
-    }
-
-    auto mapping_cfg = scop_info_.user_config_.GetReplaceConfig()[WARP_COMPUTE];
-    CHECK(mapping_cfg != nullptr) << "mapping config is null";
-    // split member that does not involved in thread mapping
-    auto mem_size = band_node.as<isl::schedule_node_band>().n_member();
-    if (mem_size > mapping_cfg->bound) {
-      band_node = band_node.as<isl::schedule_node_band>().split(mem_size - mapping_cfg->bound);
-      band_node = band_node.child(0);
-    }
-
-    std::string matrix_name = scop_info_.analysis_result_.GetMatrixMatmulMap()[id_name];
-    std::string matrix_major = scop_info_.analysis_result_.GetMatrixMatmulMajor()[id_name];
-    isl::multi_val tile_size_val = GetRealTileSizeVal(band_node, matrix_name, matrix_major);
-    band_node = TileBand(band_node, tile_size_val);
-
-    node = band_node.ancestor(band_node.get_tree_depth() - start_depth);
-    return node;
-  };
-
-  return root.map_descendant_bottom_up(CollectReadWriteFilter);
-}
-
 isl::multi_val RegisterMemoryManager::GetRealTileSizeVal(const isl::schedule_node &node, const std::string &matrix_name,
                                                          const std::string &matrix_major) {
   auto ctx = node.ctx();
@@ -713,129 +365,6 @@ isl::multi_val RegisterMemoryManager::GetRealTileSizeVal(const isl::schedule_nod
   return tile_size_val;
 }
 
-isl::schedule RegisterMemoryManager::RunMatmul(isl::schedule_node root) {
-  GetActualPromotedSharedTensors();
-  auto sch = HoistRegisterMemoryOnMark(root);
-  if (scop_info_.user_config_.GetEnableTensorCoreUsePoly()) {
-    root = sch.get_root();
-    sch = TileTensorAccordingInterfaceValue(root).get_schedule();
-  }
-  std::string write_name = GetPromotedWriteName();
-  std::string marker_name = PROMOTE_REGISTER_TO_GLOBAL;
-  if (write_name == SHARED_WRITE_ID_NAME) {
-    marker_name = PROMOTE_REGISTER_TO_SHARED;
-  }
-  sch = InsertMarkerForThreadGroup(sch, write_name, marker_name);
-  return sch;
-}
-
-isl::schedule RegisterMemoryManager::RunReduce(isl::schedule_node root) {
-  auto sch = root.get_schedule();
-  auto res_node = GetRegisterPromotedNode(root);
-  if (res_node.isa<isl::schedule_node_band>()) {
-    auto depth = UpdateDepth(res_node);
-    sch = HoistRegisterMemory(root, depth);
-  }
-  return sch;
-}
-
-isl::schedule RegisterMemoryManager::RunElementWise(isl::schedule_node root) {
-  auto sch = root.get_schedule();
-  auto CollectGMLReadWriterFilter = [this](isl::schedule_node node) -> isl::schedule_node {
-    if (!node.isa<isl::schedule_node_filter>()) {
-      return node;
-    }
-
-    bool is_all_sets_read_or_write = IsReadOrWriteTensor(node, GML_READ_ID_NAME, GML_WRITE_ID_NAME);
-    if (!is_all_sets_read_or_write) {
-      return node;
-    }
-
-    auto filter = node.as<isl::schedule_node_filter>().filter();
-    auto filter_set = filter.unwrap();
-    bool is_vectorization_tensor = false;
-    filter_set.range().foreach_set([this, &is_vectorization_tensor](const isl::set &s) -> void {
-      std::string promoted_tensor = s.get_tuple_name();
-      for (auto buffer : scop_info_.analysis_result_.active_buffer_footprints_) {
-        auto cluster_id = buffer.second.cluster_id;
-        if (cluster_id.get_name() == promoted_tensor) {
-          auto cluster = buffer.second.cluster;
-          auto box_sizes = cluster->GetFixedBoxSizes();
-          auto local_size = 1;
-          for (auto i : box_sizes) {
-            local_size = local_size * i;
-          }
-          if (local_size == 4 || local_size == 8) {
-            // vectorization mode fp32 or fp16
-            is_vectorization_tensor = true;
-          }
-        }
-      }
-    });
-
-    if (!is_vectorization_tensor) {
-      return node;
-    }
-
-    if (node.n_children() > 0 && node.child(0).isa<isl::schedule_node_band>()) {
-      node = node.child(0).insert_mark(PROMOTE_VECTORIZATION);
-      node = node.parent();
-    }
-    return node;
-  };
-
-  isl::schedule_node res_node = root;
-  if (scop_info_.user_config_.GetEnableVectorization()) {
-    res_node = GetVectorizationPromotedNode(root);
-    if (res_node.isa<isl::schedule_node_domain>()) {
-      return sch;
-    }
-  } else {
-    res_node = GetRegisterPromotedNode(root);
-  }
-
-  if (res_node.isa<isl::schedule_node_band>()) {
-    auto depth = UpdateDepth(res_node);
-
-    sch = HoistRegisterMemory(root, depth);
-
-    if (scop_info_.user_config_.GetEnableVectorization()) {
-      auto tmp_root = sch.get_root();
-      tmp_root = tmp_root.map_descendant_bottom_up(CollectGMLReadWriterFilter);
-      sch = tmp_root.get_schedule();
-    }
-  }
-  return sch;
-}
-
-isl::schedule RegisterMemoryManager::Run(isl::schedule sch) {
-  if (!scop_info_.user_config_.GetLocalTensors().empty()) {
-    configed_tensors_ = Split(scop_info_.user_config_.GetLocalTensors(), " ");
-  }
-  if (scop_info_.user_config_.GetEnableMatmul()) {
-    local_tensor_c_ = GetMatmulTensorsName(scop_info_)[MATRIX_C];
-  }
-
-  sch = InsertContextNode(sch, scop_info_);
-
-  if (!scop_info_.user_config_.GetUseRegisterMemory()) {
-    return sch;
-  }
-
-  schedule_ = sch;
-  auto root = sch.get_root();
-
-  if (scop_info_.user_config_.GetEnableMatmul()) {
-    sch = RunMatmul(root);
-  } else if (scop_info_.analysis_result_.GetUseGpuReduceLib()) {
-    sch = RunReduce(root);
-  } else {
-    sch = RunElementWise(root);
-  }
-
-  return sch;
-}
-
 }  // namespace poly
 }  // namespace ir
 }  // namespace akg
diff --git a/src/poly/schedule_pass_gpu/register_memory_manager.h b/src/poly/schedule_pass_gpu/register_memory_manager.h
index 9dee0478fef06b240899c0c46be67c851b56a079..62b6b2889d99853b1a61e2eaafc5540ee8c12cf1 100644
--- a/src/poly/schedule_pass_gpu/register_memory_manager.h
+++ b/src/poly/schedule_pass_gpu/register_memory_manager.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,62 +36,40 @@ class RegisterMemoryManager : public SchedulePass {
   explicit RegisterMemoryManager(PassInfo &pass_info, ScopInfo &scop_info)
       : pass_info_(pass_info), scop_info_(scop_info) {
     pass_name_ = __FUNCTION__;
-    remain_memory_ = MAX_REGISTER_PER_THREAD_BLOCK * REGISTER_ALLOC_RATIO;
   };
   ~RegisterMemoryManager() {}
 
   virtual isl::schedule Run(isl::schedule sch);
 
-  isl::schedule HoistRegisterMemoryOnDepth(isl::schedule_node &node, size_t depth);
+ private:
+  isl::schedule HoistRegisterMemory();
+  isl::schedule_node HoistRegisterMemoryOnMark(const isl::schedule_node &orig_node);
+  isl::union_map GetPartialSchedule(const isl::schedule_node &node);
+  isl::schedule_node HoistClusters(const isl::schedule_node &node);
 
-  void CreateTensorCluster(const isl::schedule_node &node, const isl::union_map &outer_sch);
+  void CreateClusterForOperator(const isl::schedule_node &node);
 
   void GatherBufferFootprintDefInfo(const isl::schedule_node &node, BufferDefInfo &tensor_info);
 
-  bool IsPromote(const TensorFootprintCluster &fp_cluster, const isl::multi_union_pw_aff &partial_sched_mupa,
-                 const isl::multi_union_pw_aff &thread_schedule);
-
   bool UnrolledLoop(const TensorFootprintCluster &fp_cluster);
 
-  isl::schedule HoistRegisterMemory(isl::schedule_node root, size_t depth);
-
-  size_t UpdateDepth(const isl::schedule_node &root);
-
   isl::schedule_node GetRegisterPromotedNode(isl::schedule_node &root);
-  isl::schedule HoistRegisterMemoryOnMark(isl::schedule_node root);
 
   isl::schedule_node AdjustConvScheduleTreeStructure(const isl::schedule_node &orig_node);
-  isl::schedule_node TileTensorAccordingInterfaceValue(isl::schedule_node &root);
+  isl::schedule_node TileTensorAccordingInterfaceValue(const isl::schedule_node &orig_node);
   isl::multi_val GetRealTileSizeVal(const isl::schedule_node &node, const std::string &matrix_name,
                                     const std::string &matrix_major);
-  std::string GetPromotedWriteName();
-
-  void GetActualPromotedSharedTensors();
-
-  bool IsReadOrWriteBand(isl::schedule_node node);
-
-  isl::schedule_node GetVectorizationPromotedNode(isl::schedule_node &root);
+  void SetPromotedWriteNameForGemm(std::string &local_tensor_c);
+  isl::schedule_node InsertMarkerForEmit(const isl::schedule_node &orig_node);
 
-  isl::schedule_node PromotedNodeUnderSequence(isl::schedule_node_sequence &node);
-
-  isl::schedule RunMatmul(isl::schedule_node root);
-
-  isl::schedule RunReduce(isl::schedule_node root);
-
-  isl::schedule RunElementWise(isl::schedule_node root);
-
-  bool CheckRAW(std::string &name);
-
- private:
   PassInfo &pass_info_;
   ScopInfo &scop_info_;
   isl::schedule schedule_;
-  std::vector<std::string> configed_tensors_;
-  bool hoist_compute_local_tensor_{true};
-  bool hoist_tensor_all_{false};
-  std::string local_tensor_c_;
-  std::string shared_tensors_;
-  size_t remain_memory_{0};
+  std::string write_name_;
+
+  int band_index_{0};
+  OuterBandNode *current_outer_bn_{nullptr};
+  std::unordered_set<std::string> mark_names_;
 };
 
 }  // namespace poly
diff --git a/src/poly/schedule_pass_gpu/shared_memory_manager.cc b/src/poly/schedule_pass_gpu/shared_memory_manager.cc
index 82f438d3164df1817fce43ed626a08c2d5275ed4..2fd4df661cbfb55705b8982ac480e5653d99b399 100644
--- a/src/poly/schedule_pass_gpu/shared_memory_manager.cc
+++ b/src/poly/schedule_pass_gpu/shared_memory_manager.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "poly/schedule_pass_gpu/operator_shared_strategy.h"
+#include "poly/create_cluster.h"
 #include "poly/schedule_pass_gpu/operator_mapping_strategy.h"
 #include "shared_memory_manager.h"
 #include "poly/schedule_tree_util.h"
@@ -28,41 +28,34 @@ namespace ir {
 namespace poly {
 
 isl::schedule SharedMemoryManager::Run(isl::schedule sch) {
-  if (!scop_info_.user_config_.GetSharedTensors().empty()) {
-    configed_tensors_ = Split(scop_info_.user_config_.GetSharedTensors(), " ");
-  }
   if (!scop_info_.user_config_.GetUseSharedMemory()) {
     return sch;
   }
   schedule_ = sch;
   PrepareInfoForPromotion();
   schedule_ = HoistSharedMemory();
-  if (scop_info_.user_config_.GetEnableMatmul()) {
-    schedule_ = InsertMarkerForThreadGroup(schedule_, WRITE_ID_NAME, PROMOTE_SHARED_TO_GLOBAL);
-  }
   schedule_ = InsertContextNode(schedule_, scop_info_);
   return schedule_;
 }
 
 void SharedMemoryManager::PrepareInfoForPromotion() {
   // Update the variable/tensor to share
-  if (!scop_info_.user_config_.GetSharedTensors().empty()) {
-    configed_tensors_ = Split(scop_info_.user_config_.GetSharedTensors(), " ");
-  }
-
+  configed_tensors_ = scop_info_.user_config_.GetSharedTensors();
   bank_conflict_ = scop_info_.user_config_.GetEnableBankConflict();
   shared_inversed_thread_map_ = scop_info_.user_config_.GetSharedInversedThreadMap();
   shared_vector_align_ = scop_info_.user_config_.GetSharedVectorAlign();
-  if (scop_info_.user_config_.GetVectorLength() && !scop_info_.user_config_.GetEnableVectorization() &&
-      !scop_info_.user_config_.EnableStitchFusion()) {
-    scop_info_.user_config_.SetEnableOneDimThread(true);
-  }
   unroll_shared_ = scop_info_.user_config_.GetUnrollShared();
 }
 
-isl::schedule SharedMemoryManager::HoistSharedMemory() {
+isl::schedule_node SharedMemoryManager::HoistSharedMemoryOnMark(const isl::schedule_node &orig_node) {
+  current_outer_bn_ = scop_info_.analysis_result_.GetOuterBandNode(band_index_);
+  if (!current_outer_bn_->use_shared_memory) {
+    return orig_node;
+  }
+  CreateClusterForOperator(orig_node);
+
   std::string mark_name = "";
-  auto HoistSharedMemoryOnMark = [this, &mark_name](isl::schedule_node node) -> isl::schedule_node {
+  auto GetMarkNode = [this, &mark_name](isl::schedule_node node) -> isl::schedule_node {
     if (!node.isa<isl::schedule_node_mark>()) {
       return node;
     }
@@ -79,25 +72,22 @@ isl::schedule SharedMemoryManager::HoistSharedMemory() {
     return HoistClusters(node.parent()).child(0);
   };
 
-  auto HoistCoreFunc = [this, HoistSharedMemoryOnMark,
-                        &mark_name](const isl::schedule_node &orig_node) -> isl::schedule_node {
-    current_outer_bn_ = scop_info_.analysis_result_.GetOuterBandNode(band_index_);
-    if (!current_outer_bn_->use_shared_memory) {
-      return orig_node;
-    }
-    CreateClusterForOperator(orig_node);
-    auto node = orig_node;
-    for (auto name : mark_names_) {
-      mark_name = name;
-      node = MapDescendantTopDown(node, HoistSharedMemoryOnMark);
-    }
-    node = MapCopiesToThreads(node, unroll_shared_);
-    return node;
-  };
+  auto node = orig_node;
+  for (auto name : mark_names_) {
+    mark_name = name;
+    node = MapDescendantTopDown(node, GetMarkNode);
+  }
+  node = MapCopiesToThreads(node, unroll_shared_);
+  node = InsertMarkerForRegisterPromotion(node);
+  node = DeleUselessMarker(node, mark_names_);
 
+  return node;
+}
+
+isl::schedule SharedMemoryManager::HoistSharedMemory() {
   isl::schedule_node node = GetOuterBand(schedule_.root());
   if (node.isa<isl::schedule_node_band>()) {
-    node = HoistCoreFunc(node);
+    node = HoistSharedMemoryOnMark(node);
   } else {
     int number = static_cast<int>(node.n_children());
     for (int i = 0, current_band_index = 0; i < number; ++i) {
@@ -107,7 +97,8 @@ isl::schedule SharedMemoryManager::HoistSharedMemory() {
       remain_memory_ = akg::common::SHARED_MEMORY_SIZE;
       mark_names_.clear();
       band_index_ = current_band_index;
-      node = HoistCoreFunc(promotion_node).ancestor(2);
+      node = HoistSharedMemoryOnMark(promotion_node);
+      node = node.parent().parent();
       ++current_band_index;
     }
   }
@@ -116,15 +107,13 @@ isl::schedule SharedMemoryManager::HoistSharedMemory() {
 }
 
 void SharedMemoryManager::CreateClusterForOperator(const isl::schedule_node &node) {
+  SharedCreateCluster create_cluster(scop_info_, band_index_);
   if (scop_info_.analysis_result_.GetUseGpuReduceLib()) {
     // reduce operator
-    is_reduce_ = true;
     mark_names_.emplace(PROMOTE_GLOBAL_TO_SHARED);
-    ReduceSharedStrategy reduce_op(scop_info_, mark_names_, band_index_);
-    reduce_op.CreateClusterList(node);
+    create_cluster.CreateClusterListForReduce(node, mark_names_);
   } else if (scop_info_.user_config_.GetEnableMatmul()) {
     // matmul operator
-    is_matmul_ = true;
     remain_memory_ = akg::common::ADVANCED_SHARED_MEMORY_SIZE;
 
     auto tensor_c_name = GetMatmulTensorsName(scop_info_)[MATRIX_C];
@@ -133,13 +122,49 @@ void SharedMemoryManager::CreateClusterForOperator(const isl::schedule_node &nod
     }
     mark_names_.emplace(PROMOTE_GLOBAL_TO_SHARED_AB);
 
-    BatchMatmulSharedStrategy matmul_op(scop_info_, mark_names_, band_index_);
-    matmul_op.CreateClusterList(node);
+    create_cluster.CreateClusterListForGemm(node, mark_names_);
   } else {
     mark_names_.emplace(PROMOTE_GLOBAL_TO_SHARED);
-    OperatorSharedStrategy other_op(scop_info_, mark_names_, band_index_);
-    other_op.CreateClusterList(node);
+    create_cluster.CreateClusterListForElementWise(node, mark_names_);
+  }
+}
+
+isl::schedule_node SharedMemoryManager::InsertMarkerForRegisterPromotion(const isl::schedule_node &orig_node) {
+  isl::schedule_node hoist_register_node = orig_node;
+
+  if (scop_info_.user_config_.GetEnableMatmul()) {
+    if (mark_names_.find(PROMOTE_GLOBAL_TO_SHARED_C) != mark_names_.end()) {
+      hoist_register_node = orig_node.child(0).insert_mark(PROMOTE_SHARED_TO_REGISTER_C);
+    }
+    hoist_register_node = InsertMarkerForThreadGroup(hoist_register_node, WRITE_ID_NAME, PROMOTE_SHARED_TO_GLOBAL);
+    return ReplaceMarker(hoist_register_node, PROMOTE_GLOBAL_TO_SHARED_AB, SHARED_MEM_PROMOTED_COMPLETE);
+  } else if (current_outer_bn_->enable_vectorization) {
+    return orig_node;
   }
+
+  size_t start_depth = orig_node.get_tree_depth();
+
+  orig_node.foreach_descendant_top_down([&hoist_register_node, this](const isl::schedule_node &node) -> bool {
+    if (node.isa<isl::schedule_node_sequence>()) {
+      auto sequence_node = node.as<isl::schedule_node_sequence>();
+      if (sequence_node.parent().isa<isl::schedule_node_extension>()) {
+        hoist_register_node = sequence_node.parent().insert_mark(PROMOTE_GLOBAL_TO_REGISTER);
+        return false;
+      }
+    }
+
+    if (node.isa<isl::schedule_node_mark>()) {
+      auto mark_node = node.as<isl::schedule_node_mark>();
+      if (mark_node.get_id().get_name() == THREAD_MARKER) {
+        hoist_register_node = mark_node.insert_mark(PROMOTE_GLOBAL_TO_REGISTER);
+        return false;
+      }
+    }
+    return true;
+  });
+
+  hoist_register_node = hoist_register_node.ancestor(hoist_register_node.get_tree_depth() - start_depth);
+  return hoist_register_node;
 }
 
 isl::schedule_node SharedMemoryManager::MapCopiesToThreads(const isl::schedule_node &orig_node, bool unroll) {
@@ -154,11 +179,14 @@ isl::schedule_node SharedMemoryManager::MapCopiesToThreads(const isl::schedule_n
     }
 
     auto band_node = GetCanMappingNode(node);
-
     auto thread_cfg = scop_info_.user_config_.GetThreadConfig();
-
     auto mapping_cfg = thread_cfg;
 
+    if (!scop_info_.user_config_.EnableStitchFusion() && !current_outer_bn_->enable_vectorization &&
+        scop_info_.user_config_.GetVectorLength()) {
+      scop_info_.user_config_.SetEnableOneDimThread(true);
+    }
+
     if (scop_info_.user_config_.GetEnableOneDimThread()) {
       mapping_cfg = GetCurrentConfig(band_node);
 
@@ -216,7 +244,7 @@ isl::schedule_node SharedMemoryManager::MapCopiesToThreads(const isl::schedule_n
     others_op.SetRequiredMappingCfg(band_node);
     // Map band under thread_root from inner dim to outer dim.
     band_node = others_op.MapDimToThreadsBlocks(band_node);
-    if (is_reduce_) {
+    if (scop_info_.analysis_result_.GetUseGpuReduceLib()) {
       std::string atomic_type = InAtomicTensors(node);
       auto InsertAtomicMarker = [atomic_type, this](isl::schedule_node atomic_node) -> isl::schedule_node {
         if (atomic_type != "" && atomic_node.has_children() && atomic_node.child(0).isa<isl::schedule_node_filter>()) {
@@ -325,7 +353,7 @@ void SharedMemoryManager::GatherBufferFootprintDefInfo(const isl::schedule_node
   isl::id tensor_id = tensor_info.tensor_id;
   Type type = scop_info_.GetDtypeOf(tensor_id);
 
-  if (is_matmul_ && tensor_id.get_name() == GetMatmulTensorsName(scop_info_)[MATRIX_C]) {
+  if (scop_info_.user_config_.GetEnableMatmul() && tensor_id.get_name() == GetMatmulTensorsName(scop_info_)[MATRIX_C]) {
     sizes.back() += 8;
   }
 
@@ -356,50 +384,9 @@ void SharedMemoryManager::GatherBufferFootprintDefInfo(const isl::schedule_node
 }
 
 isl::schedule_node SharedMemoryManager::HoistClusters(const isl::schedule_node &node) {
-  auto partial_sched_mupa = ShortScheduleMupa(schedule_.root(), node);
-
-  std::vector<BufferDefInfo> buffer_def_infos_origin;
-  std::vector<BufferDefInfo> buffer_def_infos_temp;
-  auto origin_binds = scop_info_.user_config_.GetOriginBind();
-  std::unordered_set<std::string> tensor_name;
-
-  for (auto i : origin_binds) {
-    if (!i.first.defined()) continue;
-    tensor_name.insert(i.first->op->name);
-  }
-
+  auto res_node = node;
   for (size_t index = 0; index < scop_info_.analysis_result_.buffer_def_infos_.size(); index++) {
     BufferDefInfo &buffer_info = scop_info_.analysis_result_.buffer_def_infos_[index];
-    if (tensor_name.count(buffer_info.tensor_id.get_name())) {
-      buffer_def_infos_origin.push_back(buffer_info);
-    } else {
-      buffer_def_infos_temp.push_back(buffer_info);
-    }
-  }
-
-  auto res_node = node;
-  if (scop_info_.analysis_result_.GetTensorOfTensor()) {
-    SharedPromotion(buffer_def_infos_temp, res_node, node, partial_sched_mupa);
-    SharedPromotion(buffer_def_infos_origin, res_node, node, partial_sched_mupa);
-
-    scop_info_.analysis_result_.buffer_def_infos_.clear();
-    for (auto &b : buffer_def_infos_temp) {
-      scop_info_.analysis_result_.buffer_def_infos_.push_back(b);
-    }
-    for (auto &b : buffer_def_infos_origin) {
-      scop_info_.analysis_result_.buffer_def_infos_.push_back(b);
-    }
-  } else {
-    SharedPromotion(scop_info_.analysis_result_.buffer_def_infos_, res_node, node, partial_sched_mupa);
-  }
-  return res_node;
-}
-
-void SharedMemoryManager::SharedPromotion(std::vector<BufferDefInfo> &bd, isl::schedule_node &res_node,
-                                          const isl::schedule_node &node,
-                                          const isl::multi_union_pw_aff &partial_sched_mupa) {
-  for (size_t index = 0; index < bd.size(); index++) {
-    BufferDefInfo &buffer_info = bd[index];
     auto fp_cluster = buffer_info.GetFootPrintClusterGPU(node);
     if ((fp_cluster == nullptr || !fp_cluster->foot_print_.box.is_valid())) {
       continue;
@@ -422,32 +409,24 @@ void SharedMemoryManager::SharedPromotion(std::vector<BufferDefInfo> &bd, isl::s
     auto approximation_size = std::accumulate(box_sizes.begin(), box_sizes.end(), 1, std::multiplies<size_t>());
     size_t byte = Bytes(id);
     size_t memory_requirement = approximation_size * byte;
-    bool use_reuse_filter = true;
-    if (InAtomicTensors(buffer_info.tensor_id.name()) || InReduceTensors(buffer_info.tensor_id.name()) || is_matmul_ ||
-        current_outer_bn_->template_type == Template::TRANSPOSE_OP) {
-      use_reuse_filter = false;
-    }
-    bool is_injective = !ReuseTensorCluster(*fp_cluster, partial_sched_mupa);
 
-    if (memory_requirement < remain_memory_) {
-      bool need_shared_memory = !use_reuse_filter || !is_injective || CoalescingAccessWay(res_node, *fp_cluster);
-      if (!need_shared_memory) {
-        continue;
-      }
-      GatherBufferFootprintDefInfo(res_node, buffer_info);
-      auto dst_id = buffer_info.dst_tensor_id;
-      res_node = HoistToBlockThreadMemory(res_node, GpuMemType::SHARED, id, dst_id, *(fp_cluster), true);
-      remain_memory_ -= memory_requirement;
-
-      // collect active_buffer_footprints_ info for codegen
-      auto out_schedule = LocalSchedule(res_node);
-      auto active_domains = CollectDomain(res_node);
-      scop_info_.analysis_result_.active_buffer_footprints_.emplace_back(std::make_pair(
-        active_domains,
-        BufferedFootPrintInfo{std::shared_ptr<TensorFootprintCluster>(std::move(fp_cluster)), out_schedule, dst_id}));
-      buffer_info.find_buffer = true;
+    if (memory_requirement >= remain_memory_) {
+      continue;
     }
+    GatherBufferFootprintDefInfo(res_node, buffer_info);
+    auto dst_id = buffer_info.dst_tensor_id;
+    res_node = HoistToBlockThreadMemory(res_node, GpuMemType::SHARED, id, dst_id, *(fp_cluster), true);
+    remain_memory_ -= memory_requirement;
+
+    // collect active_buffer_footprints_ info for codegen
+    auto out_schedule = LocalSchedule(res_node);
+    auto active_domains = CollectDomain(res_node);
+    scop_info_.analysis_result_.active_buffer_footprints_.emplace_back(std::make_pair(
+      active_domains,
+      BufferedFootPrintInfo{std::shared_ptr<TensorFootprintCluster>(std::move(fp_cluster)), out_schedule, dst_id}));
+    buffer_info.find_buffer = true;
   }
+  return res_node;
 }
 
 isl::schedule_node SharedMemoryManager::HoistToBlockThreadMemory(isl::schedule_node &tree, GpuMemType type,
@@ -465,42 +444,6 @@ isl::schedule_node SharedMemoryManager::HoistToBlockThreadMemory(isl::schedule_n
   return res_node;
 }
 
-bool SharedMemoryManager::CoalescingAccessWay(const isl::schedule_node &node, const TensorFootprintCluster &cluster) {
-  isl::union_map original = cluster.OrigianlAccessRelations();
-  size_t tensor_dim = cluster.foot_print_.GetBoxDim();
-  std::vector<isl::schedule_node> thread_marker = CollectFnNode(IsThreadMappedMark, schedule_.root());
-  for (auto item : thread_marker) {
-    if (!(item.isa<isl::schedule_node_mark>()) && !(item.has_children()) &&
-        !(item.child(0).isa<isl::schedule_node_filter>())) {
-      continue;
-    }
-    isl::schedule_node thread_filter = item.child(0);
-    if (!thread_filter.has_children()) {
-      continue;
-    }
-    isl::schedule_node thread_band = thread_filter.child(0);
-    if (!thread_band.has_children()) {
-      continue;
-    }
-    isl::schedule_node inner_band = thread_band.child(0);
-    size_t num_mapped_thread = inner_band.schedule_depth() - thread_band.schedule_depth();
-    if (num_mapped_thread == 0) {
-      continue;
-    }
-    size_t inner_depth = inner_band.schedule_depth();
-    auto active_domains = CollectDomain(thread_band);
-    auto local_access = original.intersect_domain(active_domains);
-    auto schedule = ShortSchedule(inner_band);
-    auto schedule_access = local_access.apply_domain(schedule);
-    for (auto access : schedule_access.get_map_list()) {
-      if (!IsSubsetForIncreaseDim(access, tensor_dim - 1, inner_depth - 1)) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
 std::string SharedMemoryManager::InAtomicTensors(isl::schedule_node &node) {
   if (!node.isa<isl::schedule_node_filter>()) {
     return "";
diff --git a/src/poly/schedule_pass_gpu/shared_memory_manager.h b/src/poly/schedule_pass_gpu/shared_memory_manager.h
index dbacf2e1fd0695a7acb85a04a61b1a8290b38630..81d0cfbe074e276bb7b945ecb91cad3291961f6d 100644
--- a/src/poly/schedule_pass_gpu/shared_memory_manager.h
+++ b/src/poly/schedule_pass_gpu/shared_memory_manager.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@ class SharedMemoryManager : public SchedulePass {
 
   virtual isl::schedule Run(isl::schedule sch);
 
+ private:
   void PrepareInfoForPromotion();
 
   // create cluster
@@ -43,16 +44,15 @@ class SharedMemoryManager : public SchedulePass {
 
   // promotion core function
   isl::schedule HoistSharedMemory();
+  isl::schedule_node HoistSharedMemoryOnMark(const isl::schedule_node &orig_node);
   void GatherBufferFootprintDefInfo(const isl::schedule_node &node, BufferDefInfo &tensor_info);
   isl::schedule_node HoistClusters(const isl::schedule_node &node);
 
-  void SharedPromotion(std::vector<BufferDefInfo> &bd, isl::schedule_node &res_node, const isl::schedule_node &node,
-                       const isl::multi_union_pw_aff &partial_sched_mupa);
   isl::schedule_node HoistToBlockThreadMemory(isl::schedule_node &tree, GpuMemType type, const isl::id &tensor_id,
                                               const isl::id &dst_tensor_id, TensorFootprintCluster &cluster,
                                               bool force_last_extension_odd);
   size_t Bytes(const isl::id tensor_id);
-  bool CoalescingAccessWay(const isl::schedule_node &node, const TensorFootprintCluster &cluster);
+  isl::schedule_node InsertMarkerForRegisterPromotion(const isl::schedule_node &orig_node);
 
   // Other optimization
   void OptimizeSharedDimension(std::vector<size_t> &sizes, Type type);
@@ -69,10 +69,9 @@ class SharedMemoryManager : public SchedulePass {
   bool InReduceTensors(const std::string &name);
   std::string AtomicMarker(const std::string &type);
 
- private:
   ScopInfo &scop_info_;
   isl::schedule schedule_;
-  std::vector<std::string> configed_tensors_;
+  std::unordered_set<std::string> configed_tensors_;
   bool bank_conflict_{false};
   bool shared_inversed_thread_map_{false};
   int shared_vector_align_{0};
diff --git a/src/poly/schedule_tree_util.cc b/src/poly/schedule_tree_util.cc
index c70ffac038e5674d3c7169f58793db226e02e210..8ac754ec760dacee561a492ed821d7c74235110d 100644
--- a/src/poly/schedule_tree_util.cc
+++ b/src/poly/schedule_tree_util.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -222,27 +222,26 @@ std::vector<isl::schedule_node> BandsSplitAfterDepth(const std::vector<isl::sche
   return MapWithFunc(split_at_depth, bands);
 }
 
-isl::schedule InsertMarkerForThreadGroup(const isl::schedule &sch, const std::string &write_name,
-                                         const std::string &marker_name) {
-  auto GetPromotedWriteFilter = [write_name, marker_name](isl::schedule_node node) -> isl::schedule_node {
+isl::schedule_node InsertMarkerForThreadGroup(const isl::schedule_node &orig_node, const std::string &filter_name,
+                                              const std::string &marker_name) {
+  auto GetPromotedWriteFilter = [filter_name, marker_name](isl::schedule_node node) -> isl::schedule_node {
     if (!node.isa<isl::schedule_node_filter>()) {
       return node;
     }
     isl::union_set uset = node.as<isl::schedule_node_filter>().get_filter();
-    bool is_gm_write = false;
-    uset.foreach_set([&is_gm_write, write_name](isl::set s) {
-      if (s.get_tuple_name() == write_name) {
-        is_gm_write = true;
+    bool is_gm_filter = false;
+    uset.foreach_set([&is_gm_filter, filter_name](isl::set s) {
+      if (s.get_tuple_name() == filter_name) {
+        is_gm_filter = true;
       }
     });
-    if (is_gm_write && node.has_parent() && node.parent().isa<isl::schedule_node_sequence>()) {
+    if (is_gm_filter && node.has_parent() && node.parent().isa<isl::schedule_node_sequence>()) {
       node = node.child(0).insert_mark(marker_name);
       node = node.parent();
     }
     return node;
   };
-  auto final_sch = sch.get_root().map_descendant_bottom_up(GetPromotedWriteFilter).schedule();
-  return final_sch;
+  return orig_node.map_descendant_bottom_up(GetPromotedWriteFilter);
 }
 
 std::string GetMarkerName(const isl::schedule_node &node, std::string find_name) {
@@ -961,6 +960,43 @@ isl::schedule_node GetMarkerNode(const isl::schedule_node &orig_node, const std:
   return node;
 }
 
+isl::schedule_node DeleUselessMarker(const isl::schedule_node &orig_node,
+                                     const std::unordered_set<std::string> &mark_names) {
+  auto DeleteMarker = [mark_names](isl::schedule_node node) -> isl::schedule_node {
+    if (!node.isa<isl::schedule_node_mark>()) {
+      return node;
+    }
+
+    auto marker_node = node.as<isl::schedule_node_mark>();
+    std::string marker_str = marker_node.get_id().get_name();
+    if (mark_names.find(marker_str) != mark_names.end()) {
+      return node.del();
+    }
+
+    return node;
+  };
+  return orig_node.map_descendant_bottom_up(DeleteMarker);
+}
+
+isl::schedule_node ReplaceMarker(const isl::schedule_node &orig_node, const std::string &orig_name,
+                                 const std::string &replaced_name) {
+  auto DeleteMarker = [orig_name, replaced_name](isl::schedule_node node) -> isl::schedule_node {
+    if (!node.isa<isl::schedule_node_mark>()) {
+      return node;
+    }
+
+    auto marker_node = node.as<isl::schedule_node_mark>();
+    std::string marker_str = marker_node.get_id().get_name();
+    if (marker_str == orig_name) {
+      node = node.del();
+      return node.insert_mark(replaced_name);
+    }
+
+    return node;
+  };
+  return orig_node.map_descendant_bottom_up(DeleteMarker);
+}
+
 }  // namespace poly
 }  // namespace ir
 }  // namespace akg
diff --git a/src/poly/schedule_tree_util.h b/src/poly/schedule_tree_util.h
index 729cec13e56eacbb630d1445f571b88ed67bbc75..f0b9b2302d0ae0d40448fb5e1ea2cb88093500bb 100644
--- a/src/poly/schedule_tree_util.h
+++ b/src/poly/schedule_tree_util.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2020-2021 Huawei Technologies Co., Ltd
+ * Copyright 2020-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -103,7 +103,7 @@ isl::multi_union_pw_aff MapDomainAllWithType(const isl::schedule_node &node, Map
 isl::map CreateMapIncreaseDim(isl::space space, unsigned dim);
 bool IsSubsetForIncreaseDim(const isl::map access, size_t tensor_dim, size_t node_dim);
 int GetLastAxis(const isl::schedule_node node, isl::union_map original_access,
-                std::unordered_set<std::string> skip_tensors);
+                std::unordered_set<std::string> skip_tensors = {});
 
 std::vector<isl::schedule_node> CollectFnNode(const std::function<bool(const isl::schedule_node &)> &fn,
                                               const isl::schedule_node &root);
@@ -114,8 +114,8 @@ isl::schedule_node UnrollByMarkOptions(isl::schedule_node &node, uint64_t unroll
 isl::map GetExtensionSpace(const isl::schedule_node &node, const isl::id &id);
 isl::schedule_node InsertExtensionNodeBeforeOrAfter(const isl::schedule_node &node, const isl::id &id, bool before);
 
-isl::schedule InsertMarkerForThreadGroup(const isl::schedule &sch, const std::string &write_name,
-                                         const std::string &marker_name);
+isl::schedule_node InsertMarkerForThreadGroup(const isl::schedule_node &orig_node, const std::string &filter_name,
+                                              const std::string &marker_name);
 std::string GetMarkerName(const isl::schedule_node &node, std::string find_name);
 
 isl::union_set GetMappingFilterInfo(const isl::schedule_node node, MappingCfg *mapping_cfg,
@@ -142,6 +142,11 @@ isl::schedule_node CheckMapSizeAndApplyTile(const isl::schedule_node &mapping_ro
                                             const std::vector<int> &additional_tile_size = {});
 isl::multi_union_pw_aff GetMappingPartialSchedule(const isl::schedule_node_band &node, const bool is_promotion = false);
 isl::schedule_node GetMarkerNode(const isl::schedule_node &orig_node, const std::string &marker_name);
+isl::schedule_node DeleUselessMarker(const isl::schedule_node &orig_node,
+                                     const std::unordered_set<std::string> &mark_names);
+
+isl::schedule_node ReplaceMarker(const isl::schedule_node &orig_node, const std::string &orig_name,
+                                 const std::string &replaced_name);
 
 }  // namespace poly
 }  // namespace ir
diff --git a/src/poly/scop_info.cc b/src/poly/scop_info.cc
index 05d65ec0c8c42a04ee7dc83be9036387e6a18e81..f98acbcf71b9d1bf3c0ffd5c4747e6b81ec65e56 100644
--- a/src/poly/scop_info.cc
+++ b/src/poly/scop_info.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2019-2021 Huawei Technologies Co., Ltd
+ * Copyright 2019-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@ namespace akg {
 namespace ir {
 namespace poly {
 constexpr int kInvalidIntAttr = -1;
+constexpr int CONV_INPUT_DIM = 5;
+constexpr int INT_BITS = 32;
 Expr kInvalidExprAttr;
 
 CubeInfo::~CubeInfo() {
@@ -162,12 +164,13 @@ bool ScopInfo::IsElewiseVMStmt(const isl::id &id) const {
 bool ScopInfo::MayWriteAfterRead(const std::string &name) const {
   std::map<int, isl::id> def;
   std::map<int, isl::id> use;
+  const int idstr_size = 2;
   for (auto a : analysis_result_.GetWrites().get_map_list()) {
     isl::id id = a.domain().unwrap().domain().get_tuple_id();
     std::string idstr = id.get_name();
     if (a.get_tuple_id(isl_dim_out).get_name() != name) continue;
-    CHECK_GE(idstr.size(), 2);
-    idstr = idstr.substr(2, idstr.size());
+    CHECK_GE(idstr.size(), idstr_size);
+    idstr = idstr.substr(idstr_size, idstr.size());
     int ref = static_cast<int>(WrappedStrtol(idstr));
     def[ref] = id;
   }
@@ -175,8 +178,8 @@ bool ScopInfo::MayWriteAfterRead(const std::string &name) const {
     isl::id id = a.domain().unwrap().domain().get_tuple_id();
     std::string idstr = id.get_name();
     if (a.get_tuple_id(isl_dim_out).get_name() != name) continue;
-    CHECK_GE(idstr.size(), 2);
-    idstr = idstr.substr(2, idstr.size());
+    CHECK_GE(idstr.size(), idstr_size);
+    idstr = idstr.substr(idstr_size, idstr.size());
     int ref = static_cast<int>(WrappedStrtol(idstr));
     use[ref] = id;
   }
@@ -408,7 +411,7 @@ Type ScopInfo::GetDtypeOf(const std::string &tensor_name) const {
     }
   }
   CHECK(false) << " no such tensor in binds: " << tensor_name;
-  return Int(32);
+  return Int(INT_BITS);
 }
 
 Type ScopInfo::GetDtypeOf(const isl::ast_expr &e) const {
@@ -416,7 +419,7 @@ Type ScopInfo::GetDtypeOf(const isl::ast_expr &e) const {
     isl::id var = op.get_arg(0).as<isl::ast_expr_id>().get_id();
     return GetDtypeOf(var);
   }
-  return Int(32);
+  return Int(INT_BITS);
 }
 
 std::vector<int> ScopInfo::GetShapeOf(const std::string &tensor_name) const {
@@ -625,8 +628,8 @@ void CubeInfo::SetConvMNKInfo() {
   auto conv_mnk_dims = GetConvMNKDims();
   if (user_config_.GetIsDynamic()) {
     for (const auto &dim : conv_mnk_dims) {
-      fractal_int_info_[dim.axis] = IntImm::make(Int(32), dim.c1_tiling_size);
-      attr_info_.Set(dim.axis, IntImm::make(Int(32), dim.c1_tiling_size));
+      fractal_int_info_[dim.axis] = IntImm::make(Int(INT_BITS), dim.c1_tiling_size);
+      attr_info_.Set(dim.axis, IntImm::make(Int(INT_BITS), dim.c1_tiling_size));
     }
   } else {
     const int c0_size = 16;
@@ -651,7 +654,8 @@ void UserConfig::CollectParams() {
         if (imm->value == 1) {
           if (const auto fd = add->a.as<air::ir::FloorDiv>()) {
             if (const auto denominator = fd->b.as<IntImm>()) {
-              if (denominator->value == 2) {
+              const int denominator_num = 2;
+              if (denominator->value == denominator_num) {
                 return CanonicalSimplify(air::ir::Div::make((fd->a + fd->b), fd->b));
               }
             }
@@ -743,6 +747,11 @@ void UserConfig::RegisterParam(const Expr &expr) {
   params_rev_map_.emplace(name, expr);
 }
 
+std::unordered_set<std::string> UserConfig::GetSplitTensors(const std::string &tensor_name) {
+  auto split_tensors = Split(tensor_name, SPACE_PATTERN);
+  return std::unordered_set<std::string>(split_tensors.begin(), split_tensors.end());
+}
+
 MappingCfg *UserConfig::GetThreadConfig() {
   bool enable_replace_cfg = (this->enable_one_dim_thread_ || this->enable_tensor_core_use_poly_);
   if (!enable_replace_cfg) {
@@ -831,7 +840,7 @@ void CubeInfo::UpdateFractalIntFirstInfo(bool is_conv_backprop_filter,
 
 void CubeInfo::UpdateFractalIntLastInfo(std::vector<size_t> filter_fp_cluster_size) {
   if (IsConvBackpropInput()) {
-    CHECK_EQ(filter_fp_cluster_size.size(), 4);
+    CHECK_EQ(filter_fp_cluster_size.size(), CONV_INPUT_DIM - 1);
     // conv_backprop_input filter: [ko, no, ni, ki]
     int64_t kh = ExtractIntFromAttrs(ATTR_CONV_KERNEL_H);
     int64_t kw = ExtractIntFromAttrs(ATTR_CONV_KERNEL_W);
@@ -840,13 +849,13 @@ void CubeInfo::UpdateFractalIntLastInfo(std::vector<size_t> filter_fp_cluster_si
 
     fractal_int_info_[ATTR_CONV_N_INNER] = (int64_t)filter_fp_cluster_size[2];
   } else if (IsConvBackpropFilter()) {
-    CHECK_EQ(filter_fp_cluster_size.size(), 5);
+    CHECK_EQ(filter_fp_cluster_size.size(), CONV_INPUT_DIM);
     // conv_backprop_filter filter: [batch, no, mo, ni, mi]
     fractal_int_info_[ATTR_CONV_TILE_M] = (int64_t)filter_fp_cluster_size[1];
     fractal_int_info_[ATTR_CONV_M_INNER] = (int64_t)filter_fp_cluster_size[3];
     fractal_int_info_[ATTR_CONV_GMM_M] = (int64_t)filter_fp_cluster_size[1] * filter_fp_cluster_size[3];
   } else {
-    CHECK_EQ(filter_fp_cluster_size.size(), 4);
+    CHECK_EQ(filter_fp_cluster_size.size(), CONV_INPUT_DIM - 1);
     // conv_forward filter: [ko, no, ni, ki]
     fractal_int_info_[ATTR_CONV_TILE_CO] = (int64_t)filter_fp_cluster_size[1];
     fractal_int_info_[ATTR_CONV_TILE_N] = (int64_t)filter_fp_cluster_size[1];
@@ -887,25 +896,26 @@ void CubeInfo::UpdateFractalIntFirstInfoConvBackpropFilter(std::vector<size_t> i
 
   fractal_int_info_[ATTR_CONV_TILE_CO] = (int64_t)fractal_fp_cluster_size[conv_tile_co];
 
-  CHECK_EQ(im2col_fp_cluster_size.size(), 6);
+  CHECK_EQ(im2col_fp_cluster_size.size(), CONV_INPUT_DIM + 1);
   fractal_int_info_[ATTR_CONV_GMM_K] = (int64_t)im2col_fp_cluster_size[conv_gmm_k];
 }
 
 void CubeInfo::UpdateFractalIntFirstInfoConvForward(std::vector<size_t> im2col_fp_cluster_size,
                                                     std::vector<size_t> fractal_fp_cluster_size) {
-  CHECK_EQ(fractal_fp_cluster_size.size(), 5);
+  CHECK_EQ(fractal_fp_cluster_size.size(), CONV_INPUT_DIM);
   fractal_int_info_[ATTR_CONV_BATCH] = (int64_t)fractal_fp_cluster_size[0];
   fractal_int_info_[ATTR_CONV_TILE_M] = (int64_t)fractal_fp_cluster_size[1];
   fractal_int_info_[ATTR_CONV_TILE_K] = (int64_t)fractal_fp_cluster_size[2];
   fractal_int_info_[ATTR_CONV_M_INNER] = (int64_t)fractal_fp_cluster_size[3];
   fractal_int_info_[ATTR_CONV_K_INNER] = (int64_t)fractal_fp_cluster_size[4];
 
-  CHECK_EQ(im2col_fp_cluster_size.size(), 6);
+  CHECK_EQ(im2col_fp_cluster_size.size(), CONV_INPUT_DIM + 1);
   fractal_int_info_[ATTR_CONV_GMM_M] = (int64_t)im2col_fp_cluster_size[1];
 }
 
 void CubeInfo::UpdateFractalIntInfoConvForward(int isolate_idx) {
-  auto C0_SIZE = IntImm::make(Int(32), 16);
+  const int c0_size = 16;
+  auto C0_SIZE = IntImm::make(Int(INT_BITS), c0_size);
   fractal_int_info_[ATTR_CONV_TILE_N] = floordiv(model_->get_co_isolate_info(isolate_idx).inner, C0_SIZE);
 
   Expr m = model_->get_h_win_isolate_info(isolate_idx).inner * model_->get_w_win_isolate_info(isolate_idx).inner;
@@ -922,16 +932,16 @@ void CubeInfo::UpdateFractalIntInfoConvForward(int isolate_idx) {
     }
   } else {
     auto tile_h = ExtractExprFromAttrs(ATTR_CONV_TILE_H);
-    tile_h = tile_h.get() ? tile_h : IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_TILE_H));
+    tile_h = tile_h.get() ? tile_h : IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_TILE_H));
     if (!Equal(tile_h, -1)) fractal_int_info_[ATTR_CONV_TILE_H] = tile_h;
     auto tile_w = ExtractExprFromAttrs(ATTR_CONV_TILE_W);
-    tile_w = tile_w.get() ? tile_w : IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_TILE_W));
+    tile_w = tile_w.get() ? tile_w : IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_TILE_W));
     if (!Equal(tile_w, -1)) fractal_int_info_[ATTR_CONV_TILE_W] = tile_w;
 
-    fractal_int_info_[ATTR_CONV_KERNEL_H] = IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_KERNEL_H));
-    fractal_int_info_[ATTR_CONV_STRIDE_H] = IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_STRIDE_H));
-    fractal_int_info_[ATTR_CONV_KERNEL_W] = IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_KERNEL_W));
-    fractal_int_info_[ATTR_CONV_STRIDE_W] = IntImm::make(Int(32), ExtractIntFromAttrs(ATTR_CONV_STRIDE_W));
+    fractal_int_info_[ATTR_CONV_KERNEL_H] = IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_KERNEL_H));
+    fractal_int_info_[ATTR_CONV_STRIDE_H] = IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_STRIDE_H));
+    fractal_int_info_[ATTR_CONV_KERNEL_W] = IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_KERNEL_W));
+    fractal_int_info_[ATTR_CONV_STRIDE_W] = IntImm::make(Int(INT_BITS), ExtractIntFromAttrs(ATTR_CONV_STRIDE_W));
   }
 }
 
@@ -1597,16 +1607,17 @@ std::string TensorMarkTag(MemType mem_type, MemFlow mem_flow) {
    *  Now REALIZE_C1/REALIZE_C0/REALIZE_BUF mark_tag is equal to its MemType.
    *  For mem_type is DDR, mark_tag is empty string "".
    * */
+  const int mem_flow_size = 3;
   switch (mem_type) {
     case MemType::C1_:
-      if (mem_flow.size() == 3 && mem_flow[0] == MemType::DDR && mem_flow[1] == MemType::C1_ &&
-          mem_flow[2] == MemType::BUF_C1_)
+      if (mem_flow.size() == mem_flow_size && mem_flow[0] == MemType::DDR && mem_flow[1] == MemType::C1_ &&
+          mem_flow[mem_flow_size - 1] == MemType::BUF_C1_)
         return REALIZE_C1BUFC1;
       return REALIZE_C1;
     case MemType::BUF_:
       // ordinary conv condition no fusion
-      if (mem_flow.size() == 3 && mem_flow[0] == MemType::DDR && mem_flow[1] == mem_type &&
-          mem_flow[2] == MemType::C0C_)
+      if (mem_flow.size() == mem_flow_size && mem_flow[0] == MemType::DDR && mem_flow[1] == mem_type &&
+          mem_flow[mem_flow_size - 1] == MemType::C0C_)
         return REALIZE_C0;
       return REALIZE_BUF;
     case MemType::C0A_:
@@ -1618,7 +1629,8 @@ std::string TensorMarkTag(MemType mem_type, MemFlow mem_flow) {
     case MemType::BUF_C0_:
       return REALIZE_BUFC0;
     case MemType::BUF_C1_:
-      if (mem_flow.size() == 2 && mem_flow[0] == MemType::DDR && mem_flow[1] == MemType::BUF_C1_) return REALIZE_C1;
+      if (mem_flow.size() == (mem_flow_size - 1) && mem_flow[0] == MemType::DDR && mem_flow[1] == MemType::BUF_C1_)
+        return REALIZE_C1;
       return REALIZE_BUFC1;
     case MemType::DDR:
       return "";
diff --git a/src/poly/scop_info.h b/src/poly/scop_info.h
index 0d9d9760bbb9dac062b7dddc594e3bac2ad2340e..41b9c4d3f584ca499ae4da58d95b36c78257ccb4 100644
--- a/src/poly/scop_info.h
+++ b/src/poly/scop_info.h
@@ -312,8 +312,8 @@ class UserConfig {
       ParseIntAttr(attrs, "csr_thread_num", &csr_thread_num_);
       ParseIntAttr(attrs, "csr_avg_row", &csr_avg_row_);
       ParseStringAttr(attrs, "shared_memory_tensors", &shared_tensors_);
+      ParseStringAttr(attrs, "register_memory_tensors", &register_tensors_);
       ParseStringAttr(attrs, "reduce_lib_type", &reduce_lib_type_);
-      ParseStringAttr(attrs, "local_memory_tensors", &local_tensors_);
       ParseVectorLengthAttr(attrs, "vector_length", &vector_length_);
     } else if (GetTarget() == TARGET_CPU) {
       ParseVectorLengthAttr(attrs, "vector_length", &vector_length_, false);
@@ -542,10 +542,13 @@ class UserConfig {
   bool GetUseSharedMemory() const { return use_shared_memory_; }
   void SetGetUseSharedMemory(bool use_shared_memory) { use_shared_memory_ = use_shared_memory; }
   void SetGetUseRegisterMemory(bool use_register_memory) { use_register_memory_ = use_register_memory; }
-  void SetSharedTensors(std::string shared_tensors) { shared_tensors_ = shared_tensors; }
-  std::string GetSharedTensors() { return shared_tensors_; }
+
+  std::unordered_set<std::string> GetSplitTensors(const std::string &tensor_name);
+  void RecordSharedTensors(const std::string &tensor_name) { shared_tensors_ += (SPACE_PATTERN + tensor_name); }
+  std::unordered_set<std::string> GetSharedTensors() { return GetSplitTensors(shared_tensors_); }
+  std::unordered_set<std::string> GetRegisterTensors() { return GetSplitTensors(register_tensors_); }
+
   std::string GetReduceLibType() { return reduce_lib_type_; }
-  std::string GetLocalTensors() { return local_tensors_; }
   void SetEnableBankConflict(bool enable_bank_conflict) { enable_bank_conflict_ = enable_bank_conflict; }
   bool GetEnableBankConflict() { return enable_bank_conflict_; }
   int GetVectorLength() { return vector_length_; }
@@ -709,16 +712,16 @@ class UserConfig {
   bool use_shared_memory_{true};
   // shared memory tensor list
   std::string shared_tensors_;
+  // local memory tensor list
+  std::string register_tensors_;
   // reduce lib type, for now, there are two selection
   // one is named "origin"
   // one is named "paris"
   std::string reduce_lib_type_{"origin"};
-  // local memory tensor list
-  std::string local_tensors_;
   // vectorization
   int vector_length_{0};
   bool enable_one_dim_thread_{false};
-  bool enable_vectorization_{false};
+  bool enable_vectorization_{true};
 
   // tiling config
   std::string b_dim_;
@@ -938,6 +941,7 @@ class AnalysisResult {
     bool enable_vectorization{false};
     bool is_thread_tile{false};
     bool is_block_tile{false};
+    std::set<std::string> coalesced_access_tensors;
   };
 
   void RecordWrites(const isl::union_map &writes) { writes_ = writes; }
diff --git a/src/poly/scop_make_schedule_tree.cc b/src/poly/scop_make_schedule_tree.cc
index f53125cf2aef87162825c9e6ef819f64795ad2a0..40eef33202a3f246cfa5ced8ba312e673a5d707f 100644
--- a/src/poly/scop_make_schedule_tree.cc
+++ b/src/poly/scop_make_schedule_tree.cc
@@ -1,5 +1,5 @@
 /**
- * Copyright 2021 Huawei Technologies Co., Ltd
+ * Copyright 2021-2022 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/src/poly/tiling/tiling_strategy_manager_gpu.cc b/src/poly/tiling/tiling_strategy_manager_gpu.cc
index 74c0cd20854ee35a7eb151b18bb5a2c98e2f9884..b5679d39d382dbe5b953818c177d13b6c8bc8f80 100644
--- a/src/poly/tiling/tiling_strategy_manager_gpu.cc
+++ b/src/poly/tiling/tiling_strategy_manager_gpu.cc
@@ -924,10 +924,12 @@ void GpuStrategy::AddGpuConstraint() {
     }
 
     is_first = false;
-  }
 
-  if (!((template_ == Template::MATMUL || template_ == Template::CONV) &&
-        analyzer_->scop_info_.user_config_.GetEnableTensorCore())) {
+    if ((template_ == Template::MATMUL || template_ == Template::CONV) &&
+        analyzer_->scop_info_.user_config_.GetEnableTensorCore()) {
+      continue;
+    }
+
     analyzer_->ForEachAxisTopDown([this](TileAxis *axis) {
       if (axis == analyzer_->RootAxis()) {
         return;
@@ -980,6 +982,9 @@ void GpuStrategy::VectorizationSpeedup() {
 }
 
 bool GpuStrategy::IsVectorized() {
+  if (!analyzer_->scop_info_.user_config_.GetEnableVectorization()) {
+    return false;
+  }
   auto reads_access = analyzer_->scop_info_.analysis_result_.GetReads().domain_factor_domain();
   auto write_access = analyzer_->scop_info_.analysis_result_.GetWrites().domain_factor_domain();
   auto original_access = reads_access.unite(write_access);
@@ -1058,7 +1063,6 @@ void GpuStrategy::CheckVectorizationForElemwiseOp() {
     analyzer_->scop_info_.user_config_.SetVectorLength(quadruple_warp_size_);
   }
   current_outer_bn_->enable_vectorization = true;
-  analyzer_->scop_info_.user_config_.SetEnableVectorization(true);
 }
 
 void GpuStrategy::ThreadConfiguration(ReduceDirection direct, bool use_lib) {
diff --git a/tests/st/ops/gpu/test_mindtricks.py b/tests/st/ops/gpu/test_mindtricks.py
index e0415684d737851d004d2b4d96dd0c312296d2b6..db9a94ac57691ebbaa3d53ab435967dbccd033ab 100644
--- a/tests/st/ops/gpu/test_mindtricks.py
+++ b/tests/st/ops/gpu/test_mindtricks.py
@@ -62,9 +62,10 @@ composite_targets = {
         "Fused_AddN_fusion_9584919353229493170",
         "Fused_Cast_BiasAdd_Gelu_fusion_7719078727474100806",
         "Fused_Cast_BiasAdd_GkDropout_tuple_getitem_TensorAdd_fusion_13282325956852925231",
-        "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1039082044534023692",
-        "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1545859458890067484",
-        "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1976850843332086880",
+        # Note: swizzle pass requires constant tensor must be promoted.
+        # "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1039082044534023692",
+        # "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1545859458890067484",
+        # "Fused_Cast_RealDiv_Reshape_FusedAdamWeightDecay_fusion_1976850843332086880",
         "Fused_GkDropout_2353362030752466006",
         "Fused_Transpose_split_18185609042134105765",
     ],