From 141463cde2943d82543b612307527840f4adaad2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=AD=99=E6=98=8A=E8=BE=B0?= <sunhaochen3@huawei.com>
Date: Tue, 16 Sep 2025 20:53:11 +0800
Subject: [PATCH] paged_cache_load supported in internal custom_ops, not using
 atb

---
 .jenkins/test/config/dependent_packages.yaml  |   2 +-
 README.md                                     |   2 +-
 .../paged_cache_load_common.h                 |   8 +-
 .../paged_cache_load_graph.cc                 |  69 +++++-
 .../paged_cache_load_pynative.cc              |  63 ++++-
 ...oad.py => test_custom_paged_cache_load.py} | 223 +++++++++++-------
 yaml/doc/paged_cache_load_doc.yaml            |  33 +--
 .../paged_cache_load_op.yaml                  |  10 -
 8 files changed, 270 insertions(+), 140 deletions(-)
 rename tests/st/{test_asd_paged_cache_load.py => test_custom_paged_cache_load.py} (64%)
diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml
index dd9a56f9c..9b6f8bd50 100644
--- a/.jenkins/test/config/dependent_packages.yaml
+++ b/.jenkins/test/config/dependent_packages.yaml
@@ -1,2 +1,2 @@
 mindspore:
-  'https://repo.mindspore.cn/mindspore/mindspore/version/202509/20250923/master_20250923144134_9319c228c9a369b583781cf172e94f3b5bd43fa8_newest/'
+  'https://repo.mindspore.cn/mindspore/mindspore/version/202509/20250923/master_20250923160018_7977cb0fe4b16b880c7f783ac1eb398af3f400e8_newest/'
diff --git a/README.md b/README.md
index a4d9a8fb5..764d1bb44 100644
--- a/README.md
+++ b/README.md
@@ -175,7 +175,7 @@ output = ms_custom_ops.reshape_and_cache(
 import mindspore as ms
 import ms_custom_ops
 
-reshape_and_cache = ms.jit(func=ms_custom_ops.reshape_and_cache)
+reshape_and_cache = ms.jit(ms_custom_ops.reshape_and_cache)
 output = reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, head_num)
 ```
 
diff --git a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h
index fb1dfc5eb..faae8b53e 100644
--- a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h
+++ b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h
@@ -25,8 +25,6 @@ enum PagedCacheLoadInputIndex : size_t {
   kPCLInputValueCacheIndex,
   kPCLInputBlockTableIndex,
   kPCLInputSeqLensIndex,
-  kPCLInputKeyIndex,
-  kPCLInputValueIndex,
   kPCLInputSeqStartsIndex,
   kPCLInputParamKvCacheCfgIndex,
   kPCLInputParamIsSeqLensCumsumTypeIndex,
@@ -40,6 +38,12 @@ enum PagedCacheLoadOutputIndex : size_t {
   kPCLOutputsNum
 };
 
+inline constexpr int64_t kNumHeadsIndex = 2;
+inline constexpr int64_t kHeadSizeIndex = 3;
+inline constexpr int64_t kNdFormatType = 0;
+inline constexpr int64_t kNzFormatType = 1;
+inline constexpr int64_t kNumHeadsMulHeadSizeIndex = 2;
+
 inline internal::InternalOpPtr CreatePagedCacheLoadOpWithFormat(const internal::InputsImmutableInfoList &inputs,
                                                                 const internal::OutputsImmutableInfoList &outputs,
                                                                 const internal::PagedCacheLoadParam &param) {
diff --git a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc
index c4fb46ff7..7eefd06b1 100644
--- a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc
+++ b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_graph.cc
@@ -16,15 +16,67 @@
 
 #include "ccsrc/base/ms_kernels_internal/graphmode/internal_kernel_mod.h"
 #include "ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h"
+#include "mindspore/ops/ops_utils/op_utils.h"
+#include "mindspore/core/include/ops/ops_func_impl/op_func_impl.h"
+#include <vector>
 
 namespace ms_custom_ops {
 class OPS_API CustomPagedCacheLoadOpFuncImpl : public OpFuncImpl {
 public:
   ShapeArray InferShape(const PrimitivePtr &primitive, const InferInfoPtrList &input_infos) const override {
-    return {input_infos[kPCLInputKeyIndex]->GetShape(), input_infos[kPCLInputValueIndex]->GetShape()};
+    int64_t format_type = input_infos[kPCLInputParamKvCacheCfgIndex]->GetScalarValueWithCheck<int64_t>();
+    bool seq_lens_consum_type = input_infos[kPCLInputParamIsSeqLensCumsumTypeIndex]->GetScalarValueWithCheck<bool>();
+    int64_t sum_context_lens = abstract::Shape::kShapeDimAny;
+
+    if (seq_lens_consum_type) {
+      if (input_infos[kPCLInputSeqLensIndex]->GetType() == mindspore::TypeId::kNumberTypeInt64) {
+        auto context_lens_tensor = input_infos[kPCLInputSeqLensIndex]->GetArrayValue<int64_t>();
+        if (context_lens_tensor.has_value()) {
+          auto context_lens_vector = context_lens_tensor.value().ToVector();
+          sum_context_lens = context_lens_vector.back();
+        }
+      } else {
+        auto context_lens_tensor = input_infos[kPCLInputSeqLensIndex]->GetArrayValue<int32_t>();
+        if (context_lens_tensor.has_value()) {
+          auto context_lens_vector = context_lens_tensor.value().ToVector();
+          sum_context_lens = context_lens_vector.back();
+        }
+      }
+    } else {
+      if (input_infos[kPCLInputSeqLensIndex]->GetType() == mindspore::TypeId::kNumberTypeInt64) {
+        auto context_lens_tensor = input_infos[kPCLInputSeqLensIndex]->GetArrayValue<int64_t>();
+        if (context_lens_tensor.has_value()) {
+          auto context_lens_vector = context_lens_tensor.value().ToVector();
+          sum_context_lens = std::accumulate(context_lens_vector.begin(), context_lens_vector.end(), 0);
+        }
+      } else {
+        auto context_lens_tensor = input_infos[kPCLInputSeqLensIndex]->GetArrayValue<int32_t>();
+        if (context_lens_tensor.has_value()) {
+          auto context_lens_vector = context_lens_tensor.value().ToVector();
+          sum_context_lens = std::accumulate(context_lens_vector.begin(), context_lens_vector.end(), 0);
+        }
+      }
+    }
+
+    ShapeVector key_out_shape{};
+    ShapeVector value_out_shape{};
+    if (format_type == kNdFormatType) {  // ND
+      int64_t num_heads = input_infos[kPCLInputKeyCacheIndex]->GetShape()[kNumHeadsIndex];
+      int64_t head_size_k = input_infos[kPCLInputKeyCacheIndex]->GetShape()[kHeadSizeIndex];
+      int64_t head_size_v = input_infos[kPCLInputValueCacheIndex]->GetShape()[kHeadSizeIndex];
+      key_out_shape = {sum_context_lens, num_heads, head_size_k};
+      value_out_shape = {sum_context_lens, num_heads, head_size_v};
+    } else {  // NZ
+      int64_t num_heads_mul_head_size_k = input_infos[kPCLInputKeyCacheIndex]->GetShape()[kNumHeadsMulHeadSizeIndex];
+      int64_t num_heads_mul_head_size_v = input_infos[kPCLInputValueCacheIndex]->GetShape()[kNumHeadsMulHeadSizeIndex];
+      key_out_shape = {sum_context_lens, num_heads_mul_head_size_k};
+      value_out_shape = {sum_context_lens, num_heads_mul_head_size_v};
+    }
+    return {key_out_shape, value_out_shape};
   }
+
   std::vector<TypeId> InferType(const PrimitivePtr &primitive, const InferInfoPtrList &input_infos) const override {
-    return {{input_infos[kPCLInputKeyIndex]->GetType(), input_infos[kPCLInputValueIndex]->GetType()}};
+    return {{input_infos[kPCLInputKeyCacheIndex]->GetType(), input_infos[kPCLInputValueCacheIndex]->GetType()}};
   }
 
   bool GeneralInferRegistered() const override { return true; }
@@ -37,7 +89,7 @@ public:
 
   void InitKernelInputsOutputsIndex() override {
     kernel_inputs_index_ = {kPCLInputKeyCacheIndex, kPCLInputValueCacheIndex, kPCLInputBlockTableIndex,
-                            kPCLInputSeqLensIndex, kPCLInputKeyIndex, kPCLInputValueIndex, kPCLInputSeqStartsIndex};
+                            kPCLInputSeqLensIndex, kPCLInputSeqStartsIndex};
     kernel_outputs_index_ = {kPCLOutputKeyOutIndex, kPCLOutputValueOutIndex};
   }
 
@@ -90,6 +142,17 @@ protected:
     param.kv_cache_cfg_type = kv_cache_cfg_type->GetValue<int64_t>().value();
     param.is_seq_lens_cumsum_type = is_seq_lens_cumsum_type->GetValue<bool>().value();
     param.has_seq_starts = has_seq_starts->GetValue<bool>().value();
+    
+    auto context_lens_tensor = ms_inputs.at(kPCLInputSeqLensIndex);
+    auto context_lens_value = context_lens_tensor->GetValueWithCheck<std::vector<int32_t>>();
+    auto sum_context_lens = 0;
+    if (is_seq_lens_cumsum_type) {
+      sum_context_lens = context_lens_value.back();
+    } else {
+      sum_context_lens = std::accumulate(context_lens_value.begin(), context_lens_value.end(), 0);
+    }
+    param.sum_context_lens = sum_context_lens;
+    MS_LOG(INFO) << "param.sum_context_lens is " << param.sum_context_lens;
     return CreatePagedCacheLoadOpWithFormat(inputs, outputs, param);
   }
 
diff --git a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc
index d35081470..23b9f21cb 100644
--- a/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc
+++ b/ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_pynative.cc
@@ -21,6 +21,7 @@
 #include "ccsrc/base/ms_kernels_internal/pyboost/internal_pyboost_runner.h"
 #include "ccsrc/ops/ms_kernels_internal/paged_cache_load/paged_cache_load_common.h"
 #include "ccsrc/utils/utils.h"
+#include <vector>
 
 namespace ms_custom_ops {
 class PagedCacheLoadRunner : public InternalPyboostRunner {
@@ -49,8 +50,6 @@ std::vector<ms::Tensor> npu_paged_cache_load(const ms::Tensor &key_cache,
                                              const ms::Tensor &value_cache,
                                              const ms::Tensor &block_table,
                                              const ms::Tensor &seq_lens,
-                                             const ms::Tensor &key,
-                                             const ms::Tensor &value,
                                              const std::optional<ms::Tensor> &seq_starts,
                                              std::optional<int64_t> kv_cache_cfg,
                                              std::optional<bool> is_seq_lens_cumsum_type,
@@ -73,12 +72,59 @@ std::vector<ms::Tensor> npu_paged_cache_load(const ms::Tensor &key_cache,
   runner->param_.is_seq_lens_cumsum_type = is_seq_lens_cumsum_type.value();
   runner->param_.has_seq_starts = has_seq_starts.value();
 
+  int64_t sum_context_lens = abstract::Shape::kShapeDimAny;
+
+  if (seq_lens.GetDataPtr() != nullptr) {
+    if (is_seq_lens_cumsum_type.value()) {
+      if (seq_lens.data_type() == mindspore::TypeId::kNumberTypeInt64) {
+        int64_t * seq_lens_ptr = static_cast<int64_t *>(seq_lens.GetDataPtr());
+        for (size_t i = 0; i < seq_lens.numel(); i ++) {
+          sum_context_lens = seq_lens_ptr[seq_lens.numel() - 1];
+        }
+      } else {
+        int32_t * seq_lens_ptr = static_cast<int32_t *>(seq_lens.GetDataPtr());
+        for (size_t i = 0; i < seq_lens.numel(); i ++) {
+          sum_context_lens = seq_lens_ptr[seq_lens.numel() - 1];
+        }
+      }
+    } else {
+      sum_context_lens = 0;
+      if (seq_lens.data_type() == mindspore::TypeId::kNumberTypeInt64) {
+        int64_t * seq_lens_ptr = static_cast<int64_t *>(seq_lens.GetDataPtr());
+        for (size_t i = 0; i < seq_lens.numel(); i ++) {
+          sum_context_lens += seq_lens_ptr[i];
+        }
+      } else {
+        int32_t * seq_lens_ptr = static_cast<int32_t *>(seq_lens.GetDataPtr());
+        for (size_t i = 0; i < seq_lens.numel(); i ++) {
+          sum_context_lens += seq_lens_ptr[i];
+        }
+      }
+    }
+  }
+  runner->param_.sum_context_lens = sum_context_lens;
   // Setup the runner with all parameters (including hash calculation)
-  runner->Setup(op_name, key_cache, value_cache, block_table, seq_lens, key, value, seq_starts, kv_cache_cfg,
+  runner->Setup(op_name, key_cache, value_cache, block_table, seq_lens, seq_starts, kv_cache_cfg,
                 is_seq_lens_cumsum_type, has_seq_starts);
-  std::vector<ms::Tensor> inputs = {key_cache, value_cache, block_table, seq_lens, key, value,
-                                    GetTensorOrEmpty(seq_starts)};
-  std::vector<ms::Tensor> outputs = {key, value};
+  std::vector<ms::Tensor> inputs = {key_cache, value_cache, block_table, seq_lens, GetTensorOrEmpty(seq_starts)};
+
+  ShapeVector key_out_shape{};
+  ShapeVector value_out_shape{};
+  if (kv_cache_cfg == kNdFormatType) {  // ND
+    int64_t num_heads = key_cache.shape()[kNumHeadsIndex];
+    int64_t head_size_k = key_cache.shape()[kHeadSizeIndex];
+    int64_t head_size_v = value_cache.shape()[kHeadSizeIndex];
+    key_out_shape = {sum_context_lens, num_heads, head_size_k};
+    value_out_shape = {sum_context_lens, num_heads, head_size_v};
+  } else {  // NZ
+    int64_t num_heads_mul_head_size_k = key_cache.shape()[kNumHeadsMulHeadSizeIndex];
+    int64_t num_heads_mul_head_size_v = value_cache.shape()[kNumHeadsMulHeadSizeIndex];
+    key_out_shape = {sum_context_lens, num_heads_mul_head_size_k};
+    value_out_shape = {sum_context_lens, num_heads_mul_head_size_v};
+  }
+  auto key_out = ms::Tensor(key_cache.data_type(), key_out_shape);
+  auto value_out = ms::Tensor(value_cache.data_type(), value_out_shape);
+  std::vector<ms::Tensor> outputs = {key_out, value_out};
   runner->GetOrCreateKernel(inputs, outputs);
   runner->Run(inputs, outputs);
   return outputs;
@@ -88,14 +134,12 @@ auto pyboost_paged_cache_load(const ms::Tensor &key_cache,
                               const ms::Tensor &value_cache,
                               const ms::Tensor &block_table,
                               const ms::Tensor &seq_lens,
-                              const ms::Tensor &key,
-                              const ms::Tensor &value,
                               const std::optional<ms::Tensor> &seq_starts,
                               std::optional<int64_t> kv_cache_cfg,
                               std::optional<bool> is_seq_lens_cumsum_type,
                               std::optional<bool> has_seq_starts) {
   return ms::pynative::PyboostRunner::Call<2>(
-      npu_paged_cache_load, key_cache, value_cache, block_table, seq_lens, key, value, seq_starts,
+      npu_paged_cache_load, key_cache, value_cache, block_table, seq_lens, seq_starts,
       kv_cache_cfg, is_seq_lens_cumsum_type, has_seq_starts);
 }
 } // namespace ms_custom_ops
@@ -104,7 +148,6 @@ MS_CUSTOM_OPS_EXTENSION_MODULE(m) {
   m.def("paged_cache_load", &ms_custom_ops::pyboost_paged_cache_load, "Paged Cache Load",
     pybind11::arg("key_cache"), pybind11::arg("value_cache"),
     pybind11::arg("block_table"), pybind11::arg("seq_lens"),
-    pybind11::arg("key"), pybind11::arg("value"),
     pybind11::arg("seq_starts") = std::nullopt,
     pybind11::arg("kv_cache_cfg") = std::nullopt,
     pybind11::arg("is_seq_lens_cumsum_type") = std::nullopt,
diff --git a/tests/st/test_asd_paged_cache_load.py b/tests/st/test_custom_paged_cache_load.py
similarity index 64%
rename from tests/st/test_asd_paged_cache_load.py
rename to tests/st/test_custom_paged_cache_load.py
index 48619804a..09a81be83 100644
--- a/tests/st/test_asd_paged_cache_load.py
+++ b/tests/st/test_custom_paged_cache_load.py
@@ -25,33 +25,38 @@ class AsdPagedCacheLoadCustom(ms.nn.Cell):
     def __init__(self):
         super().__init__()
 
-    @jit
-    def construct(self, key_cache, value_cache, block_table, seq_lens, key, value, seq_starts, kv_cache_cfg,
+    def construct(self, key_cache, value_cache, block_table, seq_lens, seq_starts, kv_cache_cfg,
                   is_seq_lens_cumsum_type, has_seq_starts):
-        return ms_custom_ops.paged_cache_load(key_cache, value_cache, block_table, seq_lens, key, value,
-                                              seq_starts, kv_cache_cfg, is_seq_lens_cumsum_type,
-                                              has_seq_starts)
+        return ms_custom_ops.paged_cache_load(key_cache, value_cache, block_table, seq_lens, seq_starts, kv_cache_cfg,
+                                              is_seq_lens_cumsum_type, has_seq_starts)
     
 def golden_calc_nd(num_tokens, num_heads, head_size_k, head_size_v, block_size, block_tables, context_lens,
-                   seq_starts, key_cache, value_cache, dtype):
-    sum_context_lens = context_lens[-1]
-    if dtype == ms.float16:
+                   seq_starts, key_cache, value_cache, dtype1, dtype2, sum_context_lens):
+    if dtype1 == ms.float16:
         key_expect = np.zeros((sum_context_lens, num_heads, head_size_k)).astype(np.float16)
-        value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.float16)
-    elif dtype == ms.bfloat16:
+    elif dtype1 == ms.bfloat16:
         key_expect = np.zeros((sum_context_lens, num_heads, head_size_k)).astype(np.float32)
-        value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.float32)
     else:
         key_expect = np.zeros((sum_context_lens, num_heads, head_size_k)).astype(np.int8)
+    if dtype2 == ms.float16:
+        value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.float16)
+    elif dtype2 == ms.bfloat16:
+        value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.float32)
+    else:
         value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.int8)
+
     kv_rslt_id = 0
     context_start = 0
     for i in range(num_tokens):
         block_table = block_tables[i]
-        context_end = int(context_lens[i + 1])
-        context_len = context_end - context_start
-        context_start = context_end
-        block_table_offset = seq_starts[i] // block_size
+        if seq_starts is None:
+            context_len = int(context_lens[i])
+            block_table_offset = 0
+        else:
+            context_end = int(context_lens[i + 1])
+            context_len = context_end - context_start
+            context_start = context_end
+            block_table_offset = seq_starts[i] // block_size
         for j in range(context_len):
             block_id = int(block_table[block_table_offset + j // block_size])
             block_offset = j % block_size
@@ -65,16 +70,18 @@ def golden_calc_nd(num_tokens, num_heads, head_size_k, head_size_v, block_size,
     return key_expect, value_expect
 
 def golden_calc_nz(num_tokens, num_heads, head_size_k, head_size_v, block_size, block_tables, context_lens,
-                   key_cache, value_cache, dtype):
-    sum_context_lens = sum(context_lens)
-    if dtype == ms.float16:
+                   key_cache, value_cache, dtype1, dtype2, sum_context_lens):
+    if dtype1 == ms.float16:
         key_expect = np.zeros((sum_context_lens, num_heads, head_size_k)).astype(np.float16)
-        value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.float16)
-    elif dtype == ms.bfloat16:
+    elif dtype1 == ms.bfloat16:
         key_expect = np.zeros((sum_context_lens, num_heads, head_size_k)).astype(np.float32)
-        value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.float32)
     else:
         key_expect = np.zeros((sum_context_lens, num_heads, head_size_k)).astype(np.int8)
+    if dtype2 == ms.float16:
+        value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.float16)
+    elif dtype2 == ms.bfloat16:
+        value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.float32)
+    else:
         value_expect = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(np.int8)
 
     kv_rslt_id = 0    
@@ -94,62 +101,80 @@ def golden_calc_nz(num_tokens, num_heads, head_size_k, head_size_v, block_size,
     return (key_expect.reshape(sum_context_lens, num_heads * head_size_k),
             value_expect.reshape(sum_context_lens, num_heads * head_size_v))
 
-def generate_data_nd(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype):
-    if dtype == ms.float16:
+def generate_data_nd(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype1, dtype2,
+                     cu_seq_lens):
+    if dtype1 == ms.float16:
         key_cache = np.random.randint(1, 11, 
                                       size=(num_blocks, block_size, num_heads, head_size_k)).astype(np.float16)
-        value_cache = np.random.randint(1, 11, 
-                                        size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.float16)
-    elif dtype == ms.bfloat16:
+    elif dtype1 == ms.bfloat16:
         key_cache = np.random.randint(1, 11, 
                                       size=(num_blocks, block_size, num_heads, head_size_k)).astype(np.float32)
-        value_cache = np.random.randint(1, 11, 
-                                        size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.float32)
     else:
         key_cache = np.random.randint(1, 11, 
                                       size=(num_blocks, block_size, num_heads, head_size_k)).astype(np.int8)
+    if dtype2 == ms.float16:
+        value_cache = np.random.randint(1, 11, 
+                                        size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.float16)
+    elif dtype2 == ms.bfloat16:
+        value_cache = np.random.randint(1, 11, 
+                                        size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.float32)
+    else:
         value_cache = np.random.randint(1, 11, 
                                         size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.int8)
+
     context_lens = [random.randint(1, 1024) for _ in range(num_tokens)]
     max_context_len = max(context_lens)
     max_num_blocks_per_req = (max_context_len + block_size -1) // block_size + 4
     block_tables = []
-    for _ in range(num_tokens):
-        block_table = [
-            random.randint(0, num_blocks - 1) for _ in range(max_num_blocks_per_req)
-        ]
-        block_tables.append(block_table)
-    cu_context_lens = [0]
-    for elem in context_lens:
-        cu_context_lens.append(cu_context_lens[-1] + elem)
-    seq_starts = [random.randint(0, 4) * block_size for _ in range(num_tokens)]
-    context_lens = np.array(cu_context_lens).astype(np.int32)
-    block_tables = np.array(block_tables).astype(np.int32)
-    seq_starts = np.array(seq_starts).astype(np.int32)
-    sum_context_lens = context_lens[-1]
-    key = np.zeros((sum_context_lens, num_heads, head_size_k)).astype(key_cache.dtype)
-    value = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(value_cache.dtype)
-    key_tensor = Tensor(key).astype(dtype)
-    value_tensor = Tensor(value).astype(dtype)
 
-    return key_cache, value_cache, block_tables, context_lens, key_tensor, value_tensor, seq_starts
+    if cu_seq_lens:
+        for _ in range(num_tokens):
+            block_table = [
+                random.randint(0, num_blocks - 1) for _ in range(max_num_blocks_per_req)
+            ]
+            block_tables.append(block_table)
+        cu_context_lens = [0]
+        for elem in context_lens:
+            cu_context_lens.append(cu_context_lens[-1] + elem)
+        seq_starts = [random.randint(0, 4) * block_size for _ in range(num_tokens)]
+        context_lens = np.array(cu_context_lens).astype(np.int32)
+        block_tables = np.array(block_tables).astype(np.int32)
+        seq_starts = np.array(seq_starts).astype(np.int32)
+        sum_context_lens = context_lens[-1]
+    
+        return key_cache, value_cache, block_tables, context_lens, seq_starts, sum_context_lens
+    else:
+        for _ in range(num_tokens):
+            block_table = [
+                random.randint(0, num_blocks - 1) for _ in range(max_num_blocks_per_req)
+            ]
+            block_tables.append(block_table)
+    
+        context_lens = np.array(context_lens).astype(np.int32)
+        block_tables = np.array(block_tables).astype(np.int32)
+        sum_context_lens = sum(context_lens)
+        return key_cache, value_cache, block_tables, context_lens, None, sum_context_lens
 
-def generate_data_nz(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype):
-    if dtype == ms.float16:
+def generate_data_nz(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype1, dtype2):
+    if dtype1 == ms.float16:
         key_cache = np.random.randint(1, 11, 
                                       size=(num_blocks, block_size, num_heads, head_size_k)).astype(np.float16)
-        value_cache = np.random.randint(1, 11, 
-                                        size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.float16)
-    elif dtype == ms.bfloat16:
+    elif dtype1 == ms.bfloat16:
         key_cache = np.random.randint(1, 11, 
                                       size=(num_blocks, block_size, num_heads, head_size_k)).astype(np.float32)
-        value_cache = np.random.randint(1, 11, 
-                                        size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.float32)
     else:
         key_cache = np.random.randint(1, 11, 
                                       size=(num_blocks, block_size, num_heads, head_size_k)).astype(np.int8)
+    if dtype2 == ms.float16:
+        value_cache = np.random.randint(1, 11, 
+                                        size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.float16)
+    elif dtype2 == ms.bfloat16:
+        value_cache = np.random.randint(1, 11, 
+                                        size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.float32)
+    else:
         value_cache = np.random.randint(1, 11, 
                                         size=(num_blocks, block_size, num_heads, head_size_v)).astype(np.int8)
+
     context_lens = [random.randint(1, 1024) for _ in range(num_tokens)]
     max_context_len = max(context_lens)
     max_num_blocks_per_req = (max_context_len + block_size -1) // block_size
@@ -163,70 +188,68 @@ def generate_data_nz(num_tokens, num_heads, head_size_k, head_size_v, block_size
     context_lens = np.array(context_lens).astype(np.int32)
     block_tables = np.array(block_tables).astype(np.int32)
     sum_context_lens = sum(context_lens)
-    key = np.zeros((sum_context_lens, num_heads * head_size_k)).astype(key_cache.dtype)
-    value = np.zeros((sum_context_lens, num_heads * head_size_v)).astype(value_cache.dtype)
-    key_tensor = Tensor(key).astype(dtype)
-    value_tensor = Tensor(value).astype(dtype)
 
-    return key_cache, value_cache, block_tables, context_lens, key_tensor, value_tensor, None
+    return key_cache, value_cache, block_tables, context_lens, None, sum_context_lens
 
-def paged_cache_load_function(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype,
+def paged_cache_load_function(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype1, dtype2,
                               format_type, cu_seq_lens, has_seq_starts):
     if format_type == 0:
-        key_cache, value_cache, block_tables, context_lens, key_tensor, value_tensor, seq_starts = (
+        key_cache, value_cache, block_tables, context_lens, seq_starts, sum_context_lens = (
             generate_data_nd(
-                num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype
+                num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype1, dtype2, cu_seq_lens
             )
         )
         key_golden, value_golden = golden_calc_nd(num_tokens, num_heads, head_size_k, head_size_v, block_size,
                                                   block_tables, context_lens, seq_starts, key_cache, value_cache,
-                                                  dtype)
-        key_cache_tensor = Tensor(key_cache).astype(dtype)
-        value_cache_tensor = Tensor(value_cache).astype(dtype)
+                                                  dtype1, dtype2, int(sum_context_lens))
+        key_cache_tensor = Tensor(key_cache).astype(dtype1)
+        value_cache_tensor = Tensor(value_cache).astype(dtype2)
     else:
-        key_cache, value_cache, block_tables, context_lens, key_tensor, value_tensor, seq_starts = (
+        key_cache, value_cache, block_tables, context_lens, seq_starts, sum_context_lens = (
             generate_data_nz(
-                num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype
+                num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype1, dtype2
             )
         )
         key_golden, value_golden = golden_calc_nz(num_tokens, num_heads, head_size_k, head_size_v, block_size,
-                                                  block_tables, context_lens, key_cache, value_cache, dtype)
+                                                  block_tables, context_lens, key_cache, value_cache, dtype1, dtype2,
+                                                  int(sum_context_lens))
         key_cache = key_cache.reshape(num_blocks, block_size, -1)
         value_cache = value_cache.reshape(num_blocks, block_size, -1)
-        key_cache_tensor = ms_custom_ops.trans_data(Tensor(key_cache).astype(dtype), transdata_type=1) # ND_TO_FRACTAL_NZ
-        value_cache_tensor = ms_custom_ops.trans_data(Tensor(value_cache).astype(dtype), transdata_type=1) # ND_TO_FRACTAL_NZ
-
+        key_cache_tensor = ms_custom_ops.trans_data(Tensor(key_cache).astype(dtype1), transdata_type=1) # ND_TO_FRACTAL_NZ
+        value_cache_tensor = ms_custom_ops.trans_data(Tensor(value_cache).astype(dtype2), transdata_type=1) # ND_TO_FRACTAL_NZ
     seq_starts_tensor = None if seq_starts is None else Tensor(seq_starts)
+
     net = AsdPagedCacheLoadCustom()
     key_out, value_out = net(
         key_cache_tensor,
         value_cache_tensor,
         Tensor(block_tables),
         Tensor(context_lens),
-        key_tensor,
-        value_tensor,
         seq_starts_tensor,
         format_type, cu_seq_lens, has_seq_starts
     )
-    if dtype == ms.bfloat16:
+    if dtype1 == ms.bfloat16:
         key_out_np = key_out.astype(ms.float32).asnumpy()
-        value_out_np = value_out.astype(ms.float32).asnumpy()
     else:
         key_out_np = key_out.asnumpy()
+    if dtype2 == ms.bfloat16:
+        value_out_np = value_out.astype(ms.float32).asnumpy()
+    else:
         value_out_np = value_out.asnumpy()
-    key_out_compare = custom_compare(key_out_np, key_golden, dtype)
+
+    key_out_compare = custom_compare(key_out_np, key_golden, dtype1)
     assert key_out_compare, "key_out compare failed"
-    value_out_compare = custom_compare(value_out_np, value_golden, dtype)
+    value_out_compare = custom_compare(value_out_np, value_golden, dtype2)
     assert value_out_compare, "value_out compare failed"
 
 @pytest.mark.level0
 @pytest.mark.platform_ascend910b
 @pytest.mark.env_onecard
-@pytest.mark.parametrize('dtype', [ms.float16, ms.int8, ms.bfloat16])
+@pytest.mark.parametrize('dtype1', [ms.float16, ms.int8, ms.bfloat16])
+@pytest.mark.parametrize('dtype2', [ms.float16, ms.int8, ms.bfloat16])
 @pytest.mark.parametrize('context_mode', [context.GRAPH_MODE, context.PYNATIVE_MODE])
-@pytest.mark.parametrize('input_param', [[128, 128, 16, 144, 128, 16, 1],
-                                         [256, 64, 16, 192, 128, 32, 1]])
-def test_paged_cache_load_nd_with_seq_starts(dtype, context_mode, input_param):
+@pytest.mark.parametrize('input_param', [[128, 128, 16, 144, 128, 16, 1]])
+def test_paged_cache_load_nd_with_seq_starts(dtype1, dtype2, context_mode, input_param):
     """
     Feature: test paged_cache_load operator
     Description: test paged_cache_load
@@ -236,21 +259,44 @@ def test_paged_cache_load_nd_with_seq_starts(dtype, context_mode, input_param):
     context.set_context(jit_config={"jit_level": "O0"})
     num_blocks, block_size, num_heads, head_size_k, head_size_v, batch, seq_len = input_param
     num_tokens = batch * seq_len
-    dtype = dtype
     format_type = 0 # 0-nd, 1-nz
     cu_seq_lens = True
     has_seq_starts = True
-    paged_cache_load_function(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype,
+    paged_cache_load_function(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype1, dtype2,
                               format_type, cu_seq_lens, has_seq_starts)
 
 @pytest.mark.level0
 @pytest.mark.platform_ascend910b
 @pytest.mark.env_onecard
-@pytest.mark.parametrize('dtype', [ms.float16, ms.int8, ms.bfloat16])
-@pytest.mark.parametrize('context_mode', [context.PYNATIVE_MODE])
-@pytest.mark.parametrize('input_param', [[128, 128, 16, 144, 128, 16, 1],
-                                         [256, 64, 16, 192, 128, 32, 1]])
-def test_paged_cache_load_nz(dtype, context_mode, input_param):
+@pytest.mark.parametrize('dtype1', [ms.float16, ms.int8, ms.bfloat16])
+@pytest.mark.parametrize('dtype2', [ms.float16, ms.int8, ms.bfloat16])
+@pytest.mark.parametrize('context_mode', [context.GRAPH_MODE, context.PYNATIVE_MODE])
+@pytest.mark.parametrize('input_param', [[128, 128, 16, 144, 128, 16, 1]])
+def test_paged_cache_load_nd_without_seq_starts(dtype1, dtype2, context_mode, input_param):
+    """
+    Feature: test paged_cache_load operator
+    Description: test paged_cache_load
+    Expectation: the result is correct
+    """
+    context.set_context(mode=context_mode, device_target="Ascend")
+    context.set_context(jit_config={"jit_level": "O0"})
+    num_blocks, block_size, num_heads, head_size_k, head_size_v, batch, seq_len = input_param
+    num_tokens = batch * seq_len
+    format_type = 0 # 0-nd, 1-nz
+    cu_seq_lens = False
+    has_seq_starts = False
+    paged_cache_load_function(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype1, dtype2,
+                              format_type, cu_seq_lens, has_seq_starts)
+
+@pytest.mark.skip # TODO: A bug exists in the ontinuous running of ND and NZ formats (@sunhaochen)
+@pytest.mark.level0
+@pytest.mark.platform_ascend910b
+@pytest.mark.env_onecard
+@pytest.mark.parametrize('dtype1', [ms.float16, ms.int8, ms.bfloat16])
+@pytest.mark.parametrize('dtype2', [ms.float16, ms.int8, ms.bfloat16])
+@pytest.mark.parametrize('context_mode', [context.GRAPH_MODE, context.PYNATIVE_MODE])
+@pytest.mark.parametrize('input_param', [[128, 128, 16, 144, 128, 16, 1]])
+def test_paged_cache_load_nz(dtype1, dtype2, context_mode, input_param):
     """
     Feature: test paged_cache_load operator
     Description: test paged_cache_load
@@ -260,9 +306,8 @@ def test_paged_cache_load_nz(dtype, context_mode, input_param):
     context.set_context(jit_config={"jit_level": "O0"})
     num_blocks, block_size, num_heads, head_size_k, head_size_v, batch, seq_len = input_param
     num_tokens = batch * seq_len
-    dtype = dtype
     format_type = 1 # 0-nd, 1-nz
     cu_seq_lens = False
     has_seq_starts = False
-    paged_cache_load_function(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype,
+    paged_cache_load_function(num_tokens, num_heads, head_size_k, head_size_v, block_size, num_blocks, dtype1, dtype2,
                               format_type, cu_seq_lens, has_seq_starts)
diff --git a/yaml/doc/paged_cache_load_doc.yaml b/yaml/doc/paged_cache_load_doc.yaml
index 2ce3ecf08..c6156a845 100644
--- a/yaml/doc/paged_cache_load_doc.yaml
+++ b/yaml/doc/paged_cache_load_doc.yaml
@@ -17,16 +17,14 @@ paged_cache_load:
             seq_lens (Tensor): recording context length of each batch in two form:
              - length of each batch.  e.g. [1, 10, 5, 20]  shape is [batch]
              - accumulated sum of the length of each batch.  e.g. [0, 1, 11, 16, 36] shape is [batch+1]
-            key (Tensor): inplaced update. It is the key after concat. [num_tokens, num_heads, head_size_k]
-            value (Tensor): inplaced update. It is the value after concat. [num_tokens, num_heads, head_size_v]
             seq_starts (Tensor): Optional input, recording where sequence starts. [batch]
             kv_cache_cfg (int): default 0, 0->nd, 1->nz
             is_seq_lens_cumsum_type (bool): default false, when using seq_starts in ND, set it to True. Otherwise, false. 
             has_seq_starts (bool): default false, when using seq_starts in ND, set it to True. Otherwise, false.
 
         Returns:
-            key_out (Tensor): same address with input "key".
-            value_out (Tensor): same address with input "value"
+            key_out (Tensor): the key after concat [num_tokens, num_heads, head_size_k]
+            value_out (Tensor): the value after concat [num_tokens, num_heads, head_size_v]
 
         Supported Platforms:
             ``Ascend910B``
@@ -40,14 +38,13 @@ paged_cache_load:
             import ms_custom_ops
 
             class AsdPagedCacheLoadCustom(ms.nn.Cell):
-                def __init__(self):
-                    super().__init__()
-                
-                def construct(self, key_cache, value_cache, block_table, seq_lens, key, value, seq_starts, kv_cache_cfg,
-                              is_seq_lens_cumsum_type, has_seq_starts):
-                    return ms_custom_ops.paged_cache_load(key_cache, value_cache, block_table, seq_lens, key, value,
-                                                          seq_starts, kv_cache_cfg, is_seq_lens_cumsum_type,
-                                                          has_seq_starts)
+            def __init__(self):
+                super().__init__()
+        
+            def construct(self, key_cache, value_cache, block_table, seq_lens, seq_starts, kv_cache_cfg,
+                          is_seq_lens_cumsum_type, has_seq_starts):
+                return ms_custom_ops.paged_cache_load(key_cache, value_cache, block_table, seq_lens, seq_starts, kv_cache_cfg,
+                                              is_seq_lens_cumsum_type, has_seq_starts)
 
             ------------------------------------ ND INPUT WITH SEQ_STARTS -------------------------------------------------------------                                              
             # dtype is in [ms.float16, ms.bfloat16, ms.int8]
@@ -83,10 +80,6 @@ paged_cache_load:
             block_tables = np.array(block_tables).astype(np.int32)
             seq_starts = np.array(seq_starts).astype(np.int32)
             sum_context_lens = context_lens[-1]
-            key = np.zeros((sum_context_lens, num_heads, head_size_k)).astype(key_cache.dtype)
-            value = np.zeros((sum_context_lens, num_heads, head_size_v)).astype(value_cache.dtype)
-            key_tensor = Tensor(key).astype(dtype)
-            value_tensor = Tensor(value).astype(dtype)
 
             seq_starts_tensor = None if seq_starts is None else Tensor(seq_starts)
             net = AsdPagedCacheLoadCustom()
@@ -95,8 +88,6 @@ paged_cache_load:
                 Tensor(value_cache).astype(dtype),
                 Tensor(block_tables),
                 Tensor(context_lens),
-                key_tensor,
-                value_tensor,
                 seq_starts_tensor,
                 format_type, cu_seq_lens, has_seq_starts
             )
@@ -135,10 +126,6 @@ paged_cache_load:
             context_lens = np.array(context_lens).astype(np.int32)
             block_tables = np.array(block_tables).astype(np.int32)
             sum_context_lens = sum(context_lens)
-            key = np.zeros((sum_context_lens, num_heads * head_size_k)).astype(key_cache.dtype)
-            value = np.zeros((sum_context_lens, num_heads * head_size_v)).astype(value_cache.dtype)
-            key_tensor = Tensor(key).astype(dtype)
-            value_tensor = Tensor(value).astype(dtype)
             seq_starts_tensor = None if seq_starts is None else Tensor(seq_starts)
             net = AsdPagedCacheLoadCustom()
             key_out, value_out = net(
@@ -146,8 +133,6 @@ paged_cache_load:
                 Tensor(value_cache).astype(dtype),
                 Tensor(block_tables),
                 Tensor(context_lens),
-                key_tensor,
-                value_tensor,
                 seq_starts_tensor,
                 format_type, cu_seq_lens, has_seq_starts
             )
diff --git a/yaml/ms_kernels_internal/paged_cache_load_op.yaml b/yaml/ms_kernels_internal/paged_cache_load_op.yaml
index 309fdc70f..301e4773d 100644
--- a/yaml/ms_kernels_internal/paged_cache_load_op.yaml
+++ b/yaml/ms_kernels_internal/paged_cache_load_op.yaml
@@ -9,10 +9,6 @@ paged_cache_load:
       dtype: tensor
     seq_lens:
       dtype: tensor
-    key:
-      dtype: tensor
-    value:
-      dtype: tensor
     seq_starts:
       dtype: tensor
       default: None
@@ -25,16 +21,10 @@ paged_cache_load:
     has_seq_starts:
       dtype: bool
       default: false
-  args_signature:
-    rw_write: key, value
-  labels:
-    side_effect_mem: True
   returns:
     key_out:
       dtype: tensor
-      inplace: key
     value_out:
       dtype: tensor
-      inplace: value
   class:
     name: PagedCacheLoad
-- 
Gitee