From f835bc89c7f02bbc7b4be1d53079e2337d3359e3 Mon Sep 17 00:00:00 2001
From: yefeng <yefeng24@huawei.com>
Date: Sun, 19 Oct 2025 18:11:31 +0800
Subject: [PATCH] Delete AKG custom-op, optimize-pass and KV-cache-manager ST tests

---
 .../st/python/akg_custom_ops/ascend_akg.ini   |  14 -
 .../akg_custom_ops/test_paged_attention.py    | 326 ----------------
 .../test_paged_attention_mask.py              | 354 ------------------
 .../akg_custom_ops/test_reshape_and_cache.py  | 232 ------------
 .../optimize_pass/test_concat_op_pass.py      |  95 -----
 .../optimize_pass/test_padv3_ge_pass.py       | 123 ------
 .../plugin_custom_ops/test_kv_cache_mgr.py    | 166 --------
 7 files changed, 1310 deletions(-)
 delete mode 100644 mindspore-lite/test/st/python/akg_custom_ops/ascend_akg.ini
 delete mode 100644 mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention.py
 delete mode 100644 mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention_mask.py
 delete mode 100644 mindspore-lite/test/st/python/akg_custom_ops/test_reshape_and_cache.py
 delete mode 100644 mindspore-lite/test/st/python/optimize_pass/test_concat_op_pass.py
 delete mode 100644 mindspore-lite/test/st/python/optimize_pass/test_padv3_ge_pass.py
 delete mode 100644 mindspore-lite/test/st/python/plugin_custom_ops/test_kv_cache_mgr.py

diff --git a/mindspore-lite/test/st/python/akg_custom_ops/ascend_akg.ini b/mindspore-lite/test/st/python/akg_custom_ops/ascend_akg.ini
deleted file mode 100644
index 8d1ed76a..00000000
--- a/mindspore-lite/test/st/python/akg_custom_ops/ascend_akg.ini
+++ /dev/null
@@ -1,14 +0,0 @@
-[ascend_context]
-provider=ge
-
-[ge_session_options]
-ge.externalWeight=1
-ge.exec.atomicCleanPolicy=1
-ge.event=notify
-ge.exec.formatMode=1
-ge.exec.precision_mode=must_keep_origin_dtype
-
-[graph_kernel_param]
-opt_level=2
-enable_cce_lib=true
-enable_cluster_ops_only=ReshapeAndCache,PagedAttention,PagedAttentionMask
\ No newline at end of file
diff --git a/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention.py b/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention.py
deleted file mode 100644
index 712d87b1..00000000
--- a/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention.py
+++ /dev/null
@@ -1,326 +0,0 @@
-# pylint: disable=C0330, C0326
-#
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""
-Test PagedAttention plugin custom ops.
-"""
-import os
-import math
-import random
-import logging
-import numpy as np
-import mindspore_lite as mslite
-from mindspore import nn
-from mindspore import Tensor, context, export
-from mindspore.ops.auto_generate.gen_ops_prim import PagedAttention
-
-MAX_SEQ_LEN = 1024
-
-
-class PagedAttentionNet(nn.Cell):
-    """
-    A single op network of PagedAttention.
-    """
-
-    def __init__(self, mp=None, strategy=None):
-        super().__init__()
-        self.n_head_no_use = 40
-        self.head_dim_no_use = 128
-        self.scale_value_no_use = 1 / math.sqrt(self.head_dim_no_use)
-        self.n_kv_head_no_use = 40
-        self.paged_attention = PagedAttention(
-            self.n_head_no_use, self.scale_value_no_use, self.n_kv_head_no_use
-        )
-        if strategy is not None:
-            self.paged_attention.shard(strategy)
-        elif mp is not None:
-            strategy = ((1, mp, 1), (1, 1, mp, 1), (1, 1, mp, 1), (1, 1), (1,))
-            self.paged_attention.shard(strategy)
-
-    def construct(self, query, key_cache, value_cache, block_tables, context_lens):
-        return self.paged_attention(
-            query, key_cache, value_cache, block_tables, context_lens
-        )
-
-
-def export_model() -> str:
-    """
-    export model with fixed shape.
-    """
-    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
-
-    num_tokens = 2
-    num_head = 32
-    head_dim = 128
-    kv_head = 16
-    num_blocks = 64
-    block_size = 128
-    max_num_blocks_per_batch = 8
-
-    q = Tensor(np.ones((num_tokens, num_head, head_dim), dtype=np.float16))
-    key_cache = Tensor(
-        np.ones((num_blocks, block_size, kv_head, head_dim), dtype=np.float16)
-    )
-    value_cache = Tensor(
-        np.ones((num_blocks, block_size, kv_head, head_dim), dtype=np.float16)
-    )
-    block_tables = Tensor(
-        np.ones((num_tokens, max_num_blocks_per_batch), dtype=np.int32)
-    )
-    context_len = Tensor(np.ones((num_tokens,), dtype=np.int32))
-
-    file_name = "paged_attention"
-    net = PagedAttentionNet()
-    export(
-        net,
-        q,
-        key_cache,
-        value_cache,
-        block_tables,
-        context_len,
-        file_name=file_name,
-        file_format="MINDIR",
-    )
-    model_name = file_name + ".mindir"
-    assert os.path.exists(model_name)
-    return model_name
-
-
-def group_matmul(head, kv_head, a, b):
-    """
-    Calculte a group(for all heads) of MatMul.
-    """
-    group_num = head // kv_head
-    score = None
-    for i in range(kv_head):
-        group_score = np.matmul(
-            a[i * group_num : (i + 1) * group_num, :, :].astype(np.float32),
-            b[i : (i + 1), :, :].astype(np.float32),
-        ).astype(np.float16)
-        if score is None:
-            score = group_score
-        else:
-            score = np.concatenate((score, group_score), 0)
-    print(score.shape)
-    return score
-
-
-def ref_masked_attention(
-    query,  # (1, num_heads, head_size)
-    key,  # (context_len, kv_heads, head_size)
-    value,
-    scale: float,
-):
-    """
-    Implement masked attention with numpy.
-    """
-    # Q * K.T
-    query = query * scale
-    query = np.transpose(query, (1, 0, 2))  # 转置-> num_head, seqlen, head_size
-    key = np.transpose(key, (1, 2, 0))  # 转置 -> kv_heads, head_size, context_len
-    sim = group_matmul(query.shape[0], key.shape[0], query, key)
-    # softmax
-    row_max = np.max(sim, axis=-1, keepdims=True)
-    sim -= row_max
-    sim = sim.astype("float32")
-    sim = np.exp(sim)
-    row_sum = np.sum(sim, axis=-1, keepdims=True)
-    p = sim / row_sum
-    p = p.astype("float16")
-    # P * V
-    value = np.transpose(value, (1, 0, 2))  # 转置-> kv_heads, seqlen, head_size
-    out = group_matmul(query.shape[0], key.shape[0], p, value)
-    out = np.transpose(out, (1, 0, 2))  # 转置-> seqlen, num_head, head_size
-    return out
-
-
-def ref_single_query_cached_kv_attention(output, paged_input) -> None:
-    """
-    Implement single query attention with numpy.
-    """
-    query, key_cache, value_cache, block_tables, context_lens = paged_input
-    num_heads = query.shape[1]
-    kv_heads = value_cache.shape[2]
-    head_size = value_cache.shape[3]
-    block_size = value_cache.shape[1]
-
-    num_input_tokens = query.shape[0]
-    for i in range(num_input_tokens):
-        q = np.expand_dims(query[i], 0)
-        block_table = block_tables[i]
-        context_len = int(context_lens[i])
-
-        # 读取不同content_len的key和value，拼接在一起。
-        keys = []
-        values = []
-        for j in range(context_len):  # 单个序列总的block个数
-            block_number = int(block_table[j // block_size])
-            block_offset = j % block_size
-
-            k = key_cache[block_number, block_offset, :, :]
-            k = k.reshape(kv_heads, head_size)
-            keys.append(k)  # 读取key的内容
-
-            v = value_cache[block_number, block_offset, :, :]
-            v = v.reshape(kv_heads, head_size)
-            values.append(v)  # 读取value的内容
-        keys = np.stack(np.array(keys), axis=0)
-        values = np.stack(np.array(values), axis=0)
-        print(
-            f"query.shape: {q.shape}, {q.dtype}, keys.shape: {keys.shape}, "
-            f"context_len: {context_len}, keyblocknum: {(context_len + block_size - 1) // block_size}, "
-            f"tail: {context_len % block_size}"
-        )
-        scale = 1.0 / (head_size**0.5)  # 1/sqrt(d)
-
-        out = ref_masked_attention(q, keys, values, scale)  # 计算attention
-
-        out = out.reshape(num_heads, head_size)  # 2D输出
-        output[i] = out
-
-
-def create_golden_data(num_tokens=2, kv_heads=16, block_size=128, num_blocks=64):
-    """
-    Create golden data for PagedAttention op.
-    """
-    num_heads = 32
-    head_size = 128
-    dtype = "float16"
-    query = np.random.uniform(
-        -1.0, 1.0, size=(num_tokens, num_heads, head_size)
-    ).astype(dtype)
-
-    # key value cache: (num_blocks, block_size, num_heads, head_size)
-    key_cache = np.random.uniform(
-        -1.0, 1.0, size=(num_blocks, block_size, kv_heads, head_size)
-    ).astype(dtype)
-    value_cache = np.random.uniform(
-        -1.0, 1.0, size=(num_blocks, block_size, kv_heads, head_size)
-    ).astype(dtype)
-
-    context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_tokens)]
-    # context_lens = [1024] * num_tokens # 每个batch对应的seqlen
-    _ = [
-        print(f"context_len: {x} % {block_size} == 1")
-        for x in context_lens
-        if x % block_size == 1
-    ]
-    max_context_len = max(context_lens)
-
-    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
-    block_tables = []  # （num_tokens, max_num_blocks_per_seq）
-    for i in range(num_tokens):
-        n_block = (context_lens[i] + block_size - 1) // block_size
-        print(f"n_block {i} = {n_block}")
-        n_pad_block = max_num_blocks_per_seq - n_block
-        block_table = [
-            random.randint(0, num_blocks - 1)
-            for _ in range(n_block)  # 给方块里面的每一个Block都分配了显存
-        ]
-        if n_pad_block != 0:
-            block_table = block_table + ([-1] * n_pad_block)
-        print(f"block table {i} = {block_table}")
-        block_tables.append(block_table)
-
-    context_lens = np.array(context_lens).astype(np.int32)
-    block_tables = np.array(block_tables).astype(np.int32)
-
-    paged_input = [query, key_cache, value_cache, block_tables, context_lens]
-    ref_output = np.zeros_like(query)
-
-    # 计算输出
-    ref_single_query_cached_kv_attention(ref_output, paged_input)
-
-    print(f"==> query shape: {query.shape}, data: \n{query}")
-    print(f"==> key_cache shape: {key_cache.shape}")
-    print(f"==> value_cache shape: {value_cache.shape}")
-    print(f"==> block_tables shape: {block_tables.shape}, data: \n{block_tables}")
-    print(f"==> context_lens shape: {context_lens.shape}, data: \n{context_lens}")
-    print("data generate finished!")
-    ref_outputs = [ref_output]
-    return paged_input, ref_outputs
-
-
-def do_mslite_infer(model_file, in_tensors):
-    """
-    Do model inference with mslite.
-    """
-    print(model_file)
-    lite_context = mslite.Context()
-    lite_context.target = ["ascend"]
-    lite_context.ascend.device_id = 2
-    lite_context.ascend.provider = "ge"
-    lite_context.ascend.rank_id = 0
-    model = mslite.Model()
-
-    script_dir = os.path.dirname(__file__)
-    config_path = os.path.join(script_dir, "ascend_akg.ini")
-    print(f"Use config file: {config_path}")
-    model.build_from_file(
-        model_file, mslite.ModelType.MINDIR, lite_context, config_path=config_path
-    )
-
-    outputs = model.predict(in_tensors)
-    np_output: list[np.ndarray] = []
-    for output in outputs:
-        np_output.append(output.get_data_to_numpy())
-        print("outputs' shape: ", np_output[-1].shape)
-    print("finish------------------")
-    return np_output
-
-
-def inference_model(mindir_model: str):
-    """
-    Inference model.
-    """
-    inputs, ref_outputs = create_golden_data()
-
-    # 运行昇腾算子
-    in_tensors = [mslite.Tensor(x) for x in inputs]
-    ascend_outputs = do_mslite_infer(mindir_model, in_tensors)
-
-    for i, ascend_output in enumerate(ascend_outputs):
-        is_close = np.allclose(ref_outputs[i], ascend_output, rtol=1e-3, atol=1e-03)
-        logging.info("ref_outputs %d:\n%s", i, ref_outputs[i])
-        logging.info("ascend_outputs %d:\n%s", i, ascend_output)
-        logging.info("ascend output %d is equal to ref output: %s", i, is_close)
-        assert is_close
-
-
-def test_paged_attention_fixed_shape():
-    """
-    Test PagedAttention of fixed shape.
-    """
-    model_path = export_model()
-    print(f"paged_attention_fixed_shape st : export success to path: {model_path}")
-    logging.info(
-        "paged_attention_fixed_shape st : export success to path: {%s}", model_path
-    )
-
-    model_path = "paged_attention.mindir"
-
-    inference_model(model_path)
-    print("paged_attention_fixed_shape st : inference success.")
-
-
-if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s",
-        filename="./test.log",
-        filemode="w",
-    )
-    test_paged_attention_fixed_shape()
diff --git a/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention_mask.py b/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention_mask.py
deleted file mode 100644
index d7468887..00000000
--- a/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention_mask.py
+++ /dev/null
@@ -1,354 +0,0 @@
-# pylint: disable=C0330, C0326
-#
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""
-Test PagedAttentionMask plugin custom ops.
-"""
-import os
-import math
-import random
-import logging
-import numpy as np
-import mindspore_lite as mslite
-from mindspore import nn
-from mindspore import Tensor, context, export
-from mindspore.ops.auto_generate.gen_ops_prim import PagedAttentionMask
-
-MAX_SEQ_LEN = 1024
-
-
-class PagedAttentionMaskNet(nn.Cell):
-    """
-    A single op network of PagedAttentionMask.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.n_head_no_use = 40
-        self.head_dim_no_use = 128
-        self.scale_value_no_use = 1 / math.sqrt(self.head_dim_no_use)
-        self.n_kv_head_no_use = 40
-        self.paged_attention_mask = PagedAttentionMask(
-            self.n_head_no_use, self.scale_value_no_use, self.n_kv_head_no_use
-        )
-
-    def construct(
-        self, query, key_cache, value_cache, block_tables, context_lens, alibi_mask
-    ):
-        return self.paged_attention_mask(
-            query, key_cache, value_cache, block_tables, context_lens, alibi_mask
-        )
-
-
-def export_model() -> str:
-    """
-    Export model with fixed shape.
-    """
-    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
-
-    num_tokens = 2
-    num_head = 32
-    head_dim = 128
-    kv_head = 16
-    num_blocks = 64
-    block_size = 128
-    max_num_blocks_per_batch = 8
-
-    q = Tensor(np.ones((num_tokens, num_head, head_dim), dtype=np.float16))
-    key_cache = Tensor(
-        np.ones((num_blocks, block_size, kv_head, head_dim), dtype=np.float16)
-    )
-    value_cache = Tensor(
-        np.ones((num_blocks, block_size, kv_head, head_dim), dtype=np.float16)
-    )
-    block_tables = Tensor(
-        np.ones((num_tokens, max_num_blocks_per_batch), dtype=np.int32)
-    )
-    context_len = Tensor(np.ones((num_tokens,), dtype=np.int32))
-    alibi_mask = Tensor(
-        np.ones((num_tokens, num_head, 1, max_num_blocks_per_batch), dtype=np.float16)
-    )
-
-    file_name = "paged_attention_mask"
-    net = PagedAttentionMaskNet()
-    export(
-        net,
-        q,
-        key_cache,
-        value_cache,
-        block_tables,
-        context_len,
-        alibi_mask,
-        file_name=file_name,
-        file_format="MINDIR",
-    )
-    model_name = file_name + ".mindir"
-    assert os.path.exists(model_name)
-    return model_name
-
-
-def group_matmul(head, kv_head, a, b):
-    """
-    Calculte a group(for all heads) of MatMul.
-    """
-    group_num = head // kv_head
-    score = None
-    for i in range(kv_head):
-        group_score = np.matmul(
-            a[i * group_num : (i + 1) * group_num, :, :].astype(np.float32),
-            b[i : (i + 1), :, :].astype(np.float32),
-        ).astype(np.float16)
-        if score is None:
-            score = group_score
-        else:
-            score = np.concatenate((score, group_score), 0)
-    print(score.shape)
-    return score
-
-
-def ref_masked_attention(
-    query,  # (1, num_heads, head_size)
-    key,  # (context_len, kv_heads, head_size)
-    value,
-    scale: float,
-    alibi_bias,
-):
-    """
-    Implement masked attention with numpy.
-    """
-    # Q * K.T
-    query = query * scale
-    query = np.transpose(query, (1, 0, 2))  # 转置-> num_head, seqlen, head_size
-    key = np.transpose(key, (1, 2, 0))  # 转置 -> kv_heads, head_size, context_len
-    sim = group_matmul(query.shape[0], key.shape[0], query, key)
-    sim = sim + alibi_bias
-
-    # softmax
-    row_max = np.max(sim, axis=-1, keepdims=True)
-    sim -= row_max
-    sim = sim.astype("float32")
-    sim = np.exp(sim)
-    row_sum = np.sum(sim, axis=-1, keepdims=True)
-    p = sim / row_sum
-    p = p.astype("float16")
-    # P * V
-    value = np.transpose(value, (1, 0, 2))  # 转置-> kv_heads, seqlen, head_size
-    out = group_matmul(query.shape[0], key.shape[0], p, value)
-    out = np.transpose(out, (1, 0, 2))  # 转置-> seqlen, num_head, head_size
-    return out
-
-
-def ref_single_query_cached_kv_attention(output, paged_input) -> None:
-    """
-    Implement single query attention with numpy.
-    """
-    query, key_cache, value_cache, block_tables, context_lens, alibi_mask = paged_input
-    num_heads = query.shape[1]
-    kv_heads = value_cache.shape[2]
-    head_size = value_cache.shape[3]
-    block_size = value_cache.shape[1]
-
-    num_input_tokens = query.shape[0]
-    for i in range(num_input_tokens):
-        q = np.expand_dims(query[i], 0)
-        block_table = block_tables[i]
-        context_len = int(context_lens[i])
-
-        # 读取不同content_len的key和value，拼接在一起。
-        keys = []
-        values = []
-        for j in range(context_len):  # 单个序列总的block个数
-            block_number = int(block_table[j // block_size])
-            block_offset = j % block_size
-
-            k = key_cache[block_number, block_offset, :, :]
-            k = k.reshape(kv_heads, head_size)
-            keys.append(k)  # 读取key的内容
-
-            v = value_cache[block_number, block_offset, :, :]
-            v = v.reshape(kv_heads, head_size)
-            values.append(v)  # 读取value的内容
-        keys = np.stack(np.array(keys), axis=0)
-        values = np.stack(np.array(values), axis=0)
-        print(
-            f"query.shape: {q.shape}, {q.dtype}, keys.shape: {keys.shape}, "
-            f"context_len: {context_len}, keyblocknum: {(context_len + block_size - 1) // block_size}, "
-            f"tail: {context_len % block_size}, alibi_bias.shape: {alibi_mask[i].shape}"
-        )
-        scale = 1.0 / (head_size**0.5)  # 1/sqrt(d)
-
-        out = ref_masked_attention(
-            q, keys, values, scale, alibi_mask[i, :, :, :context_len]
-        )  # 计算attention
-
-        out = out.reshape(num_heads, head_size)  # 2D输出
-        output[i] = out
-
-
-def create_golden_data(num_tokens=2, kv_heads=16, block_size=128, num_blocks=64):
-    """
-    Create golden data for PagedAttentionMask op.
-    """
-    num_heads = 32
-    head_size = 128
-    dtype = "float16"
-    query = np.random.uniform(
-        -1.0, 1.0, size=(num_tokens, num_heads, head_size)
-    ).astype(dtype)
-
-    # key value cache: (num_blocks, block_size, num_heads, head_size)
-    key_cache = np.random.uniform(
-        -1.0, 1.0, size=(num_blocks, block_size, kv_heads, head_size)
-    ).astype(dtype)
-    value_cache = np.random.uniform(
-        -1.0, 1.0, size=(num_blocks, block_size, kv_heads, head_size)
-    ).astype(dtype)
-
-    context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_tokens)]
-    # context_lens = [1024] * num_tokens # 每个batch对应的seqlen
-    _ = [
-        print(f"context_len: {x} % {block_size} == 1")
-        for x in context_lens
-        if x % block_size == 1
-    ]
-    max_context_len = max(context_lens)
-
-    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
-    block_tables = []  # （num_tokens, max_num_blocks_per_seq）
-    for i in range(num_tokens):
-        n_block = (context_lens[i] + block_size - 1) // block_size
-        print(f"n_block {i} = {n_block}")
-        n_pad_block = max_num_blocks_per_seq - n_block
-        block_table = [
-            random.randint(0, num_blocks - 1)
-            for _ in range(n_block)  # 给方块里面的每一个Block都分配了显存
-        ]
-        if n_pad_block != 0:
-            block_table = block_table + ([-1] * n_pad_block)
-        print(f"block table {i} = {block_table}")
-        block_tables.append(block_table)
-
-    context_lens = np.array(context_lens).astype(np.int32)
-    block_tables = np.array(block_tables).astype(np.int32)
-
-    # alibi mask
-    alibi_slopes = np.random.random(num_heads).astype(np.float16)
-    alibi_mask = np.zeros((num_tokens, num_heads, 1, max_context_len), dtype=np.float16)
-    for i, context_len in enumerate(context_lens):
-        position_ids = np.arange(context_len).astype(np.int32)
-        alibi_bias = (position_ids - context_len + 1).astype(
-            np.float16
-        )  # -context_len+1, -context_len+2,..,0
-        alibi_bias = alibi_slopes.reshape(-1, 1, 1) * alibi_bias.reshape(
-            1, 1, -1
-        )  # (head_num, 1, context)
-        alibi_mask[i, :, :, :context_len] = alibi_bias
-    print(f"alibi_mask.shape = {alibi_mask.shape}")
-
-    paged_input = [
-        query,
-        key_cache,
-        value_cache,
-        block_tables,
-        context_lens,
-        alibi_mask,
-    ]
-    ref_output = np.zeros_like(query)
-
-    # 计算输出
-    ref_single_query_cached_kv_attention(ref_output, paged_input)
-
-    print(f"==> query shape: {query.shape}, data: \n{query}")
-    print(f"==> key_cache shape: {key_cache.shape}")
-    print(f"==> value_cache shape: {value_cache.shape}")
-    print(f"==> block_tables shape: {block_tables.shape}, data: \n{block_tables}")
-    print(f"==> context_lens shape: {context_lens.shape}, data: \n{context_lens}")
-    print(f"==> alibi_mask shape: {alibi_mask.shape}, data: \n{alibi_mask}")
-    print("data generate done!")
-    ref_outputs = [ref_output]
-    return paged_input, ref_outputs
-
-
-def do_mslite_infer(model_file, in_tensors):
-    """
-    Do model inference with mslite.
-    """
-    print(model_file)
-    lite_context = mslite.Context()
-    lite_context.target = ["ascend"]
-    lite_context.ascend.device_id = 2
-    lite_context.ascend.provider = "ge"
-    lite_context.ascend.rank_id = 0
-    model = mslite.Model()
-
-    script_dir = os.path.dirname(__file__)
-    config_path = os.path.join(script_dir, "ascend_akg.ini")
-    print(f"Use config file: {config_path}")
-    model.build_from_file(
-        model_file, mslite.ModelType.MINDIR, lite_context, config_path=config_path
-    )
-
-    outputs = model.predict(in_tensors)
-    np_output: list[np.ndarray] = []
-    for output in outputs:
-        np_output.append(output.get_data_to_numpy())
-        print("outputs' shape: ", np_output[-1].shape)
-    print("finish------------------")
-    return np_output
-
-
-def inference_model(mindir_model: str):
-    """
-    Inference model.
-    """
-    inputs, ref_outputs = create_golden_data()
-
-    # 运行昇腾算子
-    in_tensors = [mslite.Tensor(x) for x in inputs]
-    ascend_outputs = do_mslite_infer(mindir_model, in_tensors)
-
-    for i, ascend_output in enumerate(ascend_outputs):
-        is_close = np.allclose(ref_outputs[i], ascend_output, rtol=1e-3, atol=1e-03)
-        logging.info("ref_outputs %d:\n%s", i, ref_outputs[i])
-        logging.info("ascend_outputs %d:\n%s", i, ascend_output)
-        logging.info("ascend output %d is equal to ref output: %s", i, is_close)
-        assert is_close
-
-
-def test_paged_attention_mask_fixed_shape():
-    """
-    Test PagedAttentionMask of fixed shape.
-    """
-    model_path = export_model()
-    print(f"paged_attention_fixed_shape st : export success to path: {model_path}")
-    logging.info(
-        "paged_attention_mask_fixed_shape st : export success to path: %s", model_path
-    )
-
-    model_path = "paged_attention_mask.mindir"
-
-    inference_model(model_path)
-    print("paged_attention_mask_fixed_shape st : inference success.")
-
-
-if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s",
-        filename="./test.log",
-        filemode="w",
-    )
-    test_paged_attention_mask_fixed_shape()
diff --git a/mindspore-lite/test/st/python/akg_custom_ops/test_reshape_and_cache.py b/mindspore-lite/test/st/python/akg_custom_ops/test_reshape_and_cache.py
deleted file mode 100644
index b52e13a9..00000000
--- a/mindspore-lite/test/st/python/akg_custom_ops/test_reshape_and_cache.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""
-Test ReshapeAndCache plugin custom ops.
-"""
-import os
-import logging
-from typing import List
-import numpy as np
-import mindspore_lite as mslite
-from mindspore import nn, ops
-from mindspore import Tensor, Parameter, context, export
-from mindspore.ops.auto_generate.gen_ops_prim import ReshapeAndCache
-
-
-class ReshapeAndCacheNet(nn.Cell):
-    """
-    ReshapeAndCacheNet.
-    """
-
-    def __init__(self, num_blocks, block_size, kv_head, head_dim):
-        super().__init__()
-        self.key_cache = Parameter(
-            np.zeros((num_blocks, block_size, kv_head, head_dim), dtype=np.float16),
-            name="key_cache",
-        )
-        self.value_cache = Parameter(
-            np.zeros((num_blocks, block_size, kv_head, head_dim), dtype=np.float16),
-            name="value_cache",
-        )
-        self.reshape_and_cache = ReshapeAndCache()
-        self.depends = ops.Depend()
-
-    def construct(self, key, value, slot_mapping):
-        out_key = self.reshape_and_cache(
-            key, value, self.key_cache, self.value_cache, slot_mapping
-        )
-        out_key_cache = self.depends(self.key_cache, out_key)
-        out_value_cache = self.depends(self.value_cache, out_key)
-        return out_key, out_key_cache, out_value_cache
-
-
-def export_model() -> str:
-    """
-    Export model with fixed shape.
-    """
-    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
-
-    num_tokens = 512
-    num_head = 40
-    head_dim = 128
-    block_size = 16
-    num_blocks = 128
-
-    key = Tensor(np.ones((num_tokens, num_head, head_dim), dtype=np.float16))
-    value = Tensor(np.ones((num_tokens, num_head, head_dim), dtype=np.float16))
-    slot_mapping = Tensor(np.ones((num_tokens,), dtype=np.int32))
-
-    file_name = "reshape_and_cache"
-    net = ReshapeAndCacheNet(num_blocks, block_size, num_head, head_dim)
-    export(net, key, value, slot_mapping, file_name=file_name, file_format="MINDIR")
-    model_name = file_name + ".mindir"
-    assert os.path.exists(model_name)
-    return model_name
-
-
-def ref_reshape_and_cache(key, value, key_cache, value_cache, slot_mapping):
-    """
-    Implement reshape_and_cache with numpy.
-    """
-    _, block_size, _, _ = key_cache.shape
-    key_out = np.zeros_like(key)
-    for i, slot_idx in enumerate(slot_mapping):
-        if slot_idx == -1:  # skip special pad slot index -1.
-            continue
-        block_index = slot_idx // block_size
-        block_offset = slot_idx % block_size
-        key_cache[block_index, block_offset, :, :] = key[i, :, :]
-        value_cache[block_index, block_offset, :, :] = value[i, :, :]
-        key_out[i, :, :] = key[i, :, :]
-    return (key_out, key_cache, value_cache)
-
-
-def create_input(cache_shape: List[int], update_shape: List[int], with_pad_slot: bool):
-    """
-    Create numpy input data for ReshapeAndCache op.
-    """
-    key = np.random.rand(*update_shape).astype(np.float16)
-    value = np.random.rand(*update_shape).astype(np.float16)
-
-    key_cache = np.zeros(cache_shape).astype(np.float16)
-    value_cache = np.zeros(cache_shape).astype(np.float16)
-
-    num_blocks = cache_shape[0]
-    block_size = cache_shape[1]
-    total_num_slots = num_blocks * block_size
-
-    num_tokens = update_shape[0]
-    if with_pad_slot:
-        # construct a slot mapping case like: [x, ..., z, -1, ..., -1]
-        num_valid_token = num_tokens // 2
-        num_pad_token = num_tokens - num_valid_token
-        slot_mapping = np.random.choice(
-            np.arange(0, total_num_slots), size=num_valid_token, replace=False
-        ).astype(np.int32)
-        pad_slots = np.array([-1 for _ in range(num_pad_token)], dtype=np.int32)
-        slot_mapping = np.concatenate((slot_mapping, pad_slots), axis=0)
-    else:
-        slot_mapping = np.random.choice(
-            np.arange(0, total_num_slots), size=num_tokens, replace=False
-        ).astype(np.int32)
-
-    return key, value, key_cache, value_cache, slot_mapping
-
-
-def create_golden_data(with_pad_slot: bool):
-    """
-    Create golden data for ReshapeAndCache op.
-    """
-    num_tokens = 512
-    num_blocks = 128
-    block_size = 16
-    num_head = 40
-    head_dim = 128
-
-    cache_shape = [num_blocks, block_size, num_head, head_dim]
-    update_shape = [num_tokens, num_head, head_dim]
-    key, value, key_cache, value_cache, slot_mapping = create_input(
-        cache_shape, update_shape, with_pad_slot
-    )
-    logging.info("slot_mapping shape: %s, data:\n%s", slot_mapping.shape, slot_mapping)
-
-    # generate golden output with numpy op implement.
-    key_golden, key_cahce_gloden, value_cache_golden = ref_reshape_and_cache(
-        key, value, key_cache, value_cache, slot_mapping
-    )
-
-    inputs = [key, value, slot_mapping]
-    ref_outputs = [key_golden, key_cahce_gloden, value_cache_golden]
-
-    return inputs, ref_outputs
-
-
-def do_mslite_infer(model_file, in_tensors):
-    """
-    Do model inference with mslite.
-    """
-    lite_context = mslite.Context()
-    lite_context.target = ["ascend"]
-    lite_context.ascend.device_id = 2
-    lite_context.ascend.provider = "ge"
-    lite_context.ascend.rank_id = 0
-    model = mslite.Model()
-
-    script_dir = os.path.dirname(__file__)
-    config_path = os.path.join(script_dir, "ascend_akg.ini")
-    print(f"Use config file: {config_path}")
-    model.build_from_file(
-        model_file, mslite.ModelType.MINDIR, lite_context, config_path=config_path
-    )
-
-    out_tensors = model.predict(in_tensors)
-    np_output = [tensor.get_data_to_numpy() for tensor in out_tensors]
-    return np_output
-
-
-def inference_model(mindir_model: str, with_pad_slot: bool):
-    """
-    Inference model
-    """
-    inputs, ref_outputs = create_golden_data(with_pad_slot)
-    # 运行昇腾算子
-    in_tensors = [mslite.Tensor(x) for x in inputs]
-    ascend_outputs = do_mslite_infer(mindir_model, in_tensors)
-
-    for i, ascend_output in enumerate(ascend_outputs):
-        is_close = np.allclose(ref_outputs[i], ascend_output, rtol=1e-3, atol=1e-03)
-        logging.info("ref_outputs %d:\n%s", i, ref_outputs[i])
-        logging.info("ascend_outputs %d:\n%s", i, ascend_output)
-        logging.info("ascend output %d is equal to ref output: %s", i, is_close)
-        assert is_close
-
-
-def test_reshape_and_cache_fixed_shape():
-    """
-    Test ReshapAndCache of fixed shape.
-    """
-    model_path = export_model()
-    print(f"reshape_and_cache_dynamic_shape st : export success to path: {model_path}")
-    logging.info(
-        "reshape_and_cache_dynamic_shape st : export success to path: %s", model_path
-    )
-
-    inference_model(model_path, with_pad_slot=False)
-    print("reshape_and_cache_dynamic_shape st : inference success.")
-
-
-def test_reshape_and_cache_skip_pad_slot():
-    """
-    Test ReshapAndCache with skipping index: -1.
-    """
-    model_path = export_model()
-    print(f"reshape_and_cache_skip_pad_slot st : export success to path: {model_path}")
-    logging.info(
-        "reshape_and_cache_skip_pad_slot st : export success to path: %s", model_path
-    )
-
-    inference_model(model_path, with_pad_slot=True)
-    print("reshape_and_cache_skip_pad_slot st : inference success.")
-
-
-if __name__ == "__main__":
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s",
-        filename="./test.log",
-        filemode="w",
-    )
-    test_reshape_and_cache_fixed_shape()
-    test_reshape_and_cache_skip_pad_slot()
diff --git a/mindspore-lite/test/st/python/optimize_pass/test_concat_op_pass.py b/mindspore-lite/test/st/python/optimize_pass/test_concat_op_pass.py
deleted file mode 100644
index b8229dea..00000000
--- a/mindspore-lite/test/st/python/optimize_pass/test_concat_op_pass.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""
-Test concat op pass.
-"""
-import os
-import sys
-import time
-import numpy as np
-import mindspore_lite as mslite
-import mindspore as ms
-from mindspore import Tensor, ops, nn
-import mindspore.common.dtype as mstype
-from mindspore import context
-
-
-class ConcatOpPassNet(nn.Cell):
-    """
-    KVCacheMgrNet.
-    """
-    def __init__(self):
-        super().__init__()
-        self.pad = ops.PadV3()
-        self.concat = ops.Concat(axis=0)
-
-    def construct(self, key):
-        pad_length = key.astype(mstype.int64)
-        key_paddings = self.concat((Tensor([0, 0, 0, 0, 0], mstype.int64), pad_length, Tensor([0, 0], mstype.int64)))
-        return key_paddings
-
-
-def dummy_tensor(shape, dtype):
-    """create dummy tensor"""
-    if None in shape:
-        return Tensor(shape=shape, dtype=dtype)
-    return Tensor(np.ones(shape=tuple(shape)), dtype=dtype)
-
-
-def export_model():
-    """
-    export model
-    """
-    in_key = dummy_tensor(shape=[None], dtype=ms.int64)
-    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
-
-    net = ConcatOpPassNet()
-    file_name = "concat_op_pass"
-    ms.export(net, in_key, file_name=file_name, file_format="MINDIR")
-    model_name = file_name + ".mindir"
-    assert os.path.exists(model_name)
-    return model_name
-
-
-def inference():
-    """
-    def inference_concat_op_pass
-    """
-    time_start_total = time.time()
-    model_path = export_model()
-
-    lite_ctx0 = mslite.Context()
-    lite_ctx0.target = ["ascend"]
-    lite_ctx0.ascend.device_id = 0
-    lite_ctx0.ascend.provider = "ge"
-    model = mslite.Model()
-    model.build_from_file(model_path, mslite.ModelType.MINDIR, lite_ctx0)
-    # warm up
-    np_data = np.ones((1), np.int64)
-    outputs = model.predict([mslite.Tensor(np_data)])
-    result = np.array([0, 0, 0, 0, 0, 1, 0, 0])
-    os.remove(model_path)
-    print(f"predict cost total {(time.time() - time_start_total)*1000} ms", flush=True)
-    assert (outputs[0].get_data_to_numpy() == result).all()
-
-
-if __name__ == '__main__':
-    print("test_concat_op_pass.py: begin run testcases.")
-    backend = sys.argv[1]
-    if backend == "Ascend":
-        inference()
-    else:
-        print(f'test_concat_op_pass.py: skip backend {backend}!')
-    print("test_concat_op_pass.py: run testcases success.")
diff --git a/mindspore-lite/test/st/python/optimize_pass/test_padv3_ge_pass.py b/mindspore-lite/test/st/python/optimize_pass/test_padv3_ge_pass.py
deleted file mode 100644
index d3b9b097..00000000
--- a/mindspore-lite/test/st/python/optimize_pass/test_padv3_ge_pass.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""
-Test concat op pass.
-"""
-import os
-import sys
-import time
-import numpy as np
-import mindspore_lite as mslite
-import mindspore as ms
-from mindspore import Tensor, ops, nn
-import mindspore.common.dtype as mstype
-from mindspore import context
-
-
-class PadV3GePassNet(nn.Cell):
-    """
-    PadV3GePassNet.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.pad = ops.PadV3()
-        self.concat = ops.Concat(axis=0)
-        self.sub = ops.Sub()
-        self.up_to_len = 10
-
-    def construct(self, key):
-        pad_length = (
-            self.sub(ms.Tensor(self.up_to_len, mstype.int32), ops.dyn_shape(key)[-2])
-            .reshape((1,))
-            .astype(mstype.int32)
-        )
-        key_paddings = self.concat(
-            (
-                Tensor([0, 0, 0], mstype.int32),
-                pad_length,
-                Tensor([0, 0, 0, 0], mstype.int32),
-            )
-        )
-        key_present = self.pad(key, key_paddings, Tensor(0, mstype.float16))
-        return key_present
-
-
-def dummy_tensor(shape, dtype):
-    """create dummy tensor"""
-    if None in shape:
-        return Tensor(shape=shape, dtype=dtype)
-    return Tensor(np.ones(shape=tuple(shape)), dtype=dtype)
-
-
-def export_model():
-    """
-    export model
-    """
-    in_key = dummy_tensor(shape=[1, 2, None, 2], dtype=ms.float16)
-    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
-
-    net = PadV3GePassNet()
-    file_name = "padv3_ge_pass"
-    ms.export(net, in_key, file_name=file_name, file_format="MINDIR")
-    model_name = file_name + ".mindir"
-    assert os.path.exists(model_name)
-    return model_name
-
-
-def inference_common():
-    """
-    def inference_padv3_ge_pass
-    """
-    time_start_total = time.time()
-    model_path = export_model()
-
-    lite_ctx0 = mslite.Context()
-    lite_ctx0.target = ["ascend"]
-    lite_ctx0.ascend.device_id = 0
-    lite_ctx0.ascend.provider = "ge"
-    model = mslite.Model()
-    model.build_from_file(model_path, mslite.ModelType.MINDIR, lite_ctx0)
-
-    # input data
-    np_data = np.zeros((1, 2, 4, 2), np.float16)
-    # change the first 2 elements of the third dimension to 1
-    np_data[:, :, :2, :] = 1.0
-    outputs = model.predict([mslite.Tensor(np_data)])
-    result = np.zeros((1, 2, 10, 2), np.float16)
-    # change the first 2 elements of the third dimension to 1
-    result[:, :, :2, :] = 1.0
-    os.remove(model_path)
-    print(f"predict cost total {(time.time() - time_start_total)*1000} ms", flush=True)
-    print("gold shape:")
-    print(result.shape)
-    print(result)
-    print("model output shape:")
-    np_out = outputs[0].get_data_to_numpy()
-    print(np_out.shape)
-    print(np_out)
-
-    assert np_out.shape == result.shape
-    assert (np_out == result).all()
-
-
-if __name__ == "__main__":
-    print("test_padv3_ge_pass.py: begin run testcases.")
-    backend = sys.argv[1]
-    if backend == "Ascend":
-        inference_common()
-    else:
-        print(f"test_padv3_ge_pass.py: skip backend {backend}!")
-    print("test_padv3_ge_pass.py: run testcases success.")
diff --git a/mindspore-lite/test/st/python/plugin_custom_ops/test_kv_cache_mgr.py b/mindspore-lite/test/st/python/plugin_custom_ops/test_kv_cache_mgr.py
deleted file mode 100644
index 80a52a8f..00000000
--- a/mindspore-lite/test/st/python/plugin_custom_ops/test_kv_cache_mgr.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright 2023 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""
-Test KVCacheMgr plugin custom ops.
-"""
-import os
-import sys
-import time
-import numpy as np
-import mindspore_lite as mslite
-import mindspore as ms
-import mindspore.nn as nn
-from mindspore.ops import operations as P
-from mindspore.ops import functional as F
-from mindspore.train.serialization import export
-from mindspore import Tensor
-from mindspore import context
-
-
-class KVCacheMgrNet(nn.Cell):
-    """
-    KVCacheMgrNet.
-    """
-    def __init__(self, batch_size, src_seq_length):
-        super().__init__()
-        self.mul = P.Mul()
-        self.add = P.Add()
-        self.tile = P.Tile()
-        self.expand_dims = P.ExpandDims()
-        self.dtype = ms.float16
-
-        seq_range = np.arange(src_seq_length).reshape(1, 1, -1)
-        self.range = Tensor(np.tile(seq_range, (batch_size, 1, 1)), ms.int32)
-        self.equal = P.Equal()
-        self.sub = P.Sub()
-
-    def construct(self, key_past, key, value_past, value, batch_valid_length):
-        current_index = F.reshape(batch_valid_length, (-1, 1, 1))
-        current_mask = F.cast(self.equal(self.range, current_index), self.dtype)
-        # Pad the key and value to seq_length with only the position index not zero
-        current_key = self.mul(key, self.expand_dims(current_mask, 3))
-        current_value = self.mul(value, self.expand_dims(current_mask, 3))
-        # Concat the previous saved state and current state
-        key = self.add(key_past, current_key)
-        value = self.add(value_past, current_value)
-
-        ans = self.sub(key, value)
-        return ans
-
-
-def create_shapes():
-    batch_size = 1
-    num_head = 40
-    seq_length = 1024
-    update_seq_length = 1
-    size_pre_head = 128
-    past_shape = (batch_size, num_head, seq_length, size_pre_head)
-    cur_shape = (batch_size, num_head, update_seq_length, size_pre_head)
-    return past_shape, cur_shape
-
-
-def create_inputs():
-    """
-    create inputs.
-    """
-    past_shape, cur_shape = create_shapes()
-
-    key_past = Tensor(np.random.rand(*past_shape), ms.float16)
-    key_cur = Tensor(np.random.rand(*cur_shape), ms.float16)
-    value_past = Tensor(np.random.rand(*past_shape), ms.float16)
-    value_cur = Tensor(np.random.rand(*cur_shape), ms.float16)
-    index = Tensor(shape=(1,), dtype=ms.int32, init=1)
-    return (key_past, key_cur, value_past, value_cur, index)
-
-
-def create_lite_inputs():
-    """
-    create lite inputs.
-    """
-    past_shape, cur_shape = create_shapes()
-
-    key_past = mslite.Tensor(np.zeros(past_shape, np.float16))
-    key_cur = mslite.Tensor(np.random.rand(*cur_shape).astype(np.float16))
-    value_past = mslite.Tensor(np.zeros(past_shape, np.float16))
-    value_cur = mslite.Tensor(np.random.rand(*cur_shape).astype(np.float16))
-    index = mslite.Tensor(np.ones(1).astype(np.int32))
-    return (key_past, key_cur, value_past, value_cur, index)
-
-
-def export_model():
-    """
-    export model
-    """
-    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
-    key_past, key_cur, value_past, value_cur, index = create_inputs()
-    batch_size = key_past.shape[0]
-    src_seq_length = key_past.shape[-2]
-
-    net = KVCacheMgrNet(batch_size, src_seq_length)
-    file_name = "kv_cache_mgr_net"
-
-    export(net, key_past, key_cur, value_past, value_cur,
-           index, file_name=file_name, file_format='MINDIR')
-    model_name = file_name + ".mindir"
-    assert os.path.exists(model_name)
-    return model_name
-
-
-def inference_kv_cache_mgr():
-    """
-    def inference_kv_cache_mgr
-    """
-    time_start_total = time.time()
-    model_path = export_model()
-    input_lists = list(create_lite_inputs())
-
-    lite_ctx0 = mslite.Context()
-    lite_ctx0.target = ["ascend"]
-    lite_ctx0.ascend.device_id = 0
-    lite_ctx0.ascend.provider = "ge"
-    model0 = mslite.Model()
-    model0.build_from_file(model_path, mslite.ModelType.MINDIR, lite_ctx0)
-    # warm up
-    outputs0 = model0.predict(input_lists)
-    time_start = time.time()
-    outputs0 = model0.predict(input_lists)
-    print(f"predict plugin_custom_ops=None cost {(time.time() - time_start)*1000} ms", flush=True)
-
-    lite_ctx1 = mslite.Context()
-    lite_ctx1.target = ["ascend"]
-    lite_ctx1.ascend.device_id = 0
-    lite_ctx1.ascend.provider = "ge"
-    dict1 = {"ascend_context": {"plugin_custom_ops": "All"}}
-    model1 = mslite.Model()
-    model1.build_from_file(model_path, mslite.ModelType.MINDIR, lite_ctx1, "", dict1)
-    # warm up
-    outputs1 = model1.predict(input_lists)
-    time_start = time.time()
-    outputs1 = model1.predict(input_lists)
-    print(f"predict plugin_custom_ops=All cost {(time.time() - time_start)*1000} ms", flush=True)
-
-    os.remove(model_path)
-    print(f"predict cost total {(time.time() - time_start_total)*1000} ms", flush=True)
-    assert (outputs0[0].get_data_to_numpy() == outputs1[0].get_data_to_numpy()).all()
-
-
-if __name__ == '__main__':
-    print("test_kv_cache_mgr_plugin_custom_ops.py: begin run testcases.")
-    backend = sys.argv[1]
-    if backend == "Ascend":
-        inference_kv_cache_mgr()
-    else:
-        print(f'test_kv_cache_mgr_plugin_custom_ops.py: skip backend {backend}!')
-    print("test_kv_cache_mgr_plugin_custom_ops.py: run testcases success.")
-- 
Gitee