From f835bc89c7f02bbc7b4be1d53079e2337d3359e3 Mon Sep 17 00:00:00 2001 From: yefeng Date: Sun, 19 Oct 2025 18:11:31 +0800 Subject: [PATCH] del code --- .../st/python/akg_custom_ops/ascend_akg.ini | 14 - .../akg_custom_ops/test_paged_attention.py | 326 ---------------- .../test_paged_attention_mask.py | 354 ------------------ .../akg_custom_ops/test_reshape_and_cache.py | 232 ------------ .../optimize_pass/test_concat_op_pass.py | 95 ----- .../optimize_pass/test_padv3_ge_pass.py | 123 ------ .../plugin_custom_ops/test_kv_cache_mgr.py | 166 -------- 7 files changed, 1310 deletions(-) delete mode 100644 mindspore-lite/test/st/python/akg_custom_ops/ascend_akg.ini delete mode 100644 mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention.py delete mode 100644 mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention_mask.py delete mode 100644 mindspore-lite/test/st/python/akg_custom_ops/test_reshape_and_cache.py delete mode 100644 mindspore-lite/test/st/python/optimize_pass/test_concat_op_pass.py delete mode 100644 mindspore-lite/test/st/python/optimize_pass/test_padv3_ge_pass.py delete mode 100644 mindspore-lite/test/st/python/plugin_custom_ops/test_kv_cache_mgr.py diff --git a/mindspore-lite/test/st/python/akg_custom_ops/ascend_akg.ini b/mindspore-lite/test/st/python/akg_custom_ops/ascend_akg.ini deleted file mode 100644 index 8d1ed76a..00000000 --- a/mindspore-lite/test/st/python/akg_custom_ops/ascend_akg.ini +++ /dev/null @@ -1,14 +0,0 @@ -[ascend_context] -provider=ge - -[ge_session_options] -ge.externalWeight=1 -ge.exec.atomicCleanPolicy=1 -ge.event=notify -ge.exec.formatMode=1 -ge.exec.precision_mode=must_keep_origin_dtype - -[graph_kernel_param] -opt_level=2 -enable_cce_lib=true -enable_cluster_ops_only=ReshapeAndCache,PagedAttention,PagedAttentionMask \ No newline at end of file diff --git a/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention.py b/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention.py deleted file mode 100644 index 712d87b1..00000000 --- a/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention.py +++ /dev/null @@ -1,326 +0,0 @@ -# pylint: disable=C0330, C0326 -# -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Test PagedAttention plugin custom ops. -""" -import os -import math -import random -import logging -import numpy as np -import mindspore_lite as mslite -from mindspore import nn -from mindspore import Tensor, context, export -from mindspore.ops.auto_generate.gen_ops_prim import PagedAttention - -MAX_SEQ_LEN = 1024 - - -class PagedAttentionNet(nn.Cell): - """ - A single op network of PagedAttention. - """ - - def __init__(self, mp=None, strategy=None): - super().__init__() - self.n_head_no_use = 40 - self.head_dim_no_use = 128 - self.scale_value_no_use = 1 / math.sqrt(self.head_dim_no_use) - self.n_kv_head_no_use = 40 - self.paged_attention = PagedAttention( - self.n_head_no_use, self.scale_value_no_use, self.n_kv_head_no_use - ) - if strategy is not None: - self.paged_attention.shard(strategy) - elif mp is not None: - strategy = ((1, mp, 1), (1, 1, mp, 1), (1, 1, mp, 1), (1, 1), (1,)) - self.paged_attention.shard(strategy) - - def construct(self, query, key_cache, value_cache, block_tables, context_lens): - return self.paged_attention( - query, key_cache, value_cache, block_tables, context_lens - ) - - -def export_model() -> str: - """ - export model with fixed shape. - """ - context.set_context(mode=context.GRAPH_MODE, device_target="CPU") - - num_tokens = 2 - num_head = 32 - head_dim = 128 - kv_head = 16 - num_blocks = 64 - block_size = 128 - max_num_blocks_per_batch = 8 - - q = Tensor(np.ones((num_tokens, num_head, head_dim), dtype=np.float16)) - key_cache = Tensor( - np.ones((num_blocks, block_size, kv_head, head_dim), dtype=np.float16) - ) - value_cache = Tensor( - np.ones((num_blocks, block_size, kv_head, head_dim), dtype=np.float16) - ) - block_tables = Tensor( - np.ones((num_tokens, max_num_blocks_per_batch), dtype=np.int32) - ) - context_len = Tensor(np.ones((num_tokens,), dtype=np.int32)) - - file_name = "paged_attention" - net = PagedAttentionNet() - export( - net, - q, - key_cache, - value_cache, - block_tables, - context_len, - file_name=file_name, - file_format="MINDIR", - ) - model_name = file_name + ".mindir" - assert os.path.exists(model_name) - return model_name - - -def group_matmul(head, kv_head, a, b): - """ - Calculte a group(for all heads) of MatMul. - """ - group_num = head // kv_head - score = None - for i in range(kv_head): - group_score = np.matmul( - a[i * group_num : (i + 1) * group_num, :, :].astype(np.float32), - b[i : (i + 1), :, :].astype(np.float32), - ).astype(np.float16) - if score is None: - score = group_score - else: - score = np.concatenate((score, group_score), 0) - print(score.shape) - return score - - -def ref_masked_attention( - query, # (1, num_heads, head_size) - key, # (context_len, kv_heads, head_size) - value, - scale: float, -): - """ - Implement masked attention with numpy. - """ - # Q * K.T - query = query * scale - query = np.transpose(query, (1, 0, 2)) # 转置-> num_head, seqlen, head_size - key = np.transpose(key, (1, 2, 0)) # 转置 -> kv_heads, head_size, context_len - sim = group_matmul(query.shape[0], key.shape[0], query, key) - # softmax - row_max = np.max(sim, axis=-1, keepdims=True) - sim -= row_max - sim = sim.astype("float32") - sim = np.exp(sim) - row_sum = np.sum(sim, axis=-1, keepdims=True) - p = sim / row_sum - p = p.astype("float16") - # P * V - value = np.transpose(value, (1, 0, 2)) # 转置-> kv_heads, seqlen, head_size - out = group_matmul(query.shape[0], key.shape[0], p, value) - out = np.transpose(out, (1, 0, 2)) # 转置-> seqlen, num_head, head_size - return out - - -def ref_single_query_cached_kv_attention(output, paged_input) -> None: - """ - Implement single query attention with numpy. - """ - query, key_cache, value_cache, block_tables, context_lens = paged_input - num_heads = query.shape[1] - kv_heads = value_cache.shape[2] - head_size = value_cache.shape[3] - block_size = value_cache.shape[1] - - num_input_tokens = query.shape[0] - for i in range(num_input_tokens): - q = np.expand_dims(query[i], 0) - block_table = block_tables[i] - context_len = int(context_lens[i]) - - # 读取不同content_len的key和value,拼接在一起。 - keys = [] - values = [] - for j in range(context_len): # 单个序列总的block个数 - block_number = int(block_table[j // block_size]) - block_offset = j % block_size - - k = key_cache[block_number, block_offset, :, :] - k = k.reshape(kv_heads, head_size) - keys.append(k) # 读取key的内容 - - v = value_cache[block_number, block_offset, :, :] - v = v.reshape(kv_heads, head_size) - values.append(v) # 读取value的内容 - keys = np.stack(np.array(keys), axis=0) - values = np.stack(np.array(values), axis=0) - print( - f"query.shape: {q.shape}, {q.dtype}, keys.shape: {keys.shape}, " - f"context_len: {context_len}, keyblocknum: {(context_len + block_size - 1) // block_size}, " - f"tail: {context_len % block_size}" - ) - scale = 1.0 / (head_size**0.5) # 1/sqrt(d) - - out = ref_masked_attention(q, keys, values, scale) # 计算attention - - out = out.reshape(num_heads, head_size) # 2D输出 - output[i] = out - - -def create_golden_data(num_tokens=2, kv_heads=16, block_size=128, num_blocks=64): - """ - Create golden data for PagedAttention op. - """ - num_heads = 32 - head_size = 128 - dtype = "float16" - query = np.random.uniform( - -1.0, 1.0, size=(num_tokens, num_heads, head_size) - ).astype(dtype) - - # key value cache: (num_blocks, block_size, num_heads, head_size) - key_cache = np.random.uniform( - -1.0, 1.0, size=(num_blocks, block_size, kv_heads, head_size) - ).astype(dtype) - value_cache = np.random.uniform( - -1.0, 1.0, size=(num_blocks, block_size, kv_heads, head_size) - ).astype(dtype) - - context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_tokens)] - # context_lens = [1024] * num_tokens # 每个batch对应的seqlen - _ = [ - print(f"context_len: {x} % {block_size} == 1") - for x in context_lens - if x % block_size == 1 - ] - max_context_len = max(context_lens) - - max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size - block_tables = [] # (num_tokens, max_num_blocks_per_seq) - for i in range(num_tokens): - n_block = (context_lens[i] + block_size - 1) // block_size - print(f"n_block {i} = {n_block}") - n_pad_block = max_num_blocks_per_seq - n_block - block_table = [ - random.randint(0, num_blocks - 1) - for _ in range(n_block) # 给方块里面的每一个Block都分配了显存 - ] - if n_pad_block != 0: - block_table = block_table + ([-1] * n_pad_block) - print(f"block table {i} = {block_table}") - block_tables.append(block_table) - - context_lens = np.array(context_lens).astype(np.int32) - block_tables = np.array(block_tables).astype(np.int32) - - paged_input = [query, key_cache, value_cache, block_tables, context_lens] - ref_output = np.zeros_like(query) - - # 计算输出 - ref_single_query_cached_kv_attention(ref_output, paged_input) - - print(f"==> query shape: {query.shape}, data: \n{query}") - print(f"==> key_cache shape: {key_cache.shape}") - print(f"==> value_cache shape: {value_cache.shape}") - print(f"==> block_tables shape: {block_tables.shape}, data: \n{block_tables}") - print(f"==> context_lens shape: {context_lens.shape}, data: \n{context_lens}") - print("data generate finished!") - ref_outputs = [ref_output] - return paged_input, ref_outputs - - -def do_mslite_infer(model_file, in_tensors): - """ - Do model inference with mslite. - """ - print(model_file) - lite_context = mslite.Context() - lite_context.target = ["ascend"] - lite_context.ascend.device_id = 2 - lite_context.ascend.provider = "ge" - lite_context.ascend.rank_id = 0 - model = mslite.Model() - - script_dir = os.path.dirname(__file__) - config_path = os.path.join(script_dir, "ascend_akg.ini") - print(f"Use config file: {config_path}") - model.build_from_file( - model_file, mslite.ModelType.MINDIR, lite_context, config_path=config_path - ) - - outputs = model.predict(in_tensors) - np_output: list[np.ndarray] = [] - for output in outputs: - np_output.append(output.get_data_to_numpy()) - print("outputs' shape: ", np_output[-1].shape) - print("finish------------------") - return np_output - - -def inference_model(mindir_model: str): - """ - Inference model. - """ - inputs, ref_outputs = create_golden_data() - - # 运行昇腾算子 - in_tensors = [mslite.Tensor(x) for x in inputs] - ascend_outputs = do_mslite_infer(mindir_model, in_tensors) - - for i, ascend_output in enumerate(ascend_outputs): - is_close = np.allclose(ref_outputs[i], ascend_output, rtol=1e-3, atol=1e-03) - logging.info("ref_outputs %d:\n%s", i, ref_outputs[i]) - logging.info("ascend_outputs %d:\n%s", i, ascend_output) - logging.info("ascend output %d is equal to ref output: %s", i, is_close) - assert is_close - - -def test_paged_attention_fixed_shape(): - """ - Test PagedAttention of fixed shape. - """ - model_path = export_model() - print(f"paged_attention_fixed_shape st : export success to path: {model_path}") - logging.info( - "paged_attention_fixed_shape st : export success to path: {%s}", model_path - ) - - model_path = "paged_attention.mindir" - - inference_model(model_path) - print("paged_attention_fixed_shape st : inference success.") - - -if __name__ == "__main__": - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s", - filename="./test.log", - filemode="w", - ) - test_paged_attention_fixed_shape() diff --git a/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention_mask.py b/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention_mask.py deleted file mode 100644 index d7468887..00000000 --- a/mindspore-lite/test/st/python/akg_custom_ops/test_paged_attention_mask.py +++ /dev/null @@ -1,354 +0,0 @@ -# pylint: disable=C0330, C0326 -# -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Test PagedAttentionMask plugin custom ops. -""" -import os -import math -import random -import logging -import numpy as np -import mindspore_lite as mslite -from mindspore import nn -from mindspore import Tensor, context, export -from mindspore.ops.auto_generate.gen_ops_prim import PagedAttentionMask - -MAX_SEQ_LEN = 1024 - - -class PagedAttentionMaskNet(nn.Cell): - """ - A single op network of PagedAttentionMask. - """ - - def __init__(self): - super().__init__() - self.n_head_no_use = 40 - self.head_dim_no_use = 128 - self.scale_value_no_use = 1 / math.sqrt(self.head_dim_no_use) - self.n_kv_head_no_use = 40 - self.paged_attention_mask = PagedAttentionMask( - self.n_head_no_use, self.scale_value_no_use, self.n_kv_head_no_use - ) - - def construct( - self, query, key_cache, value_cache, block_tables, context_lens, alibi_mask - ): - return self.paged_attention_mask( - query, key_cache, value_cache, block_tables, context_lens, alibi_mask - ) - - -def export_model() -> str: - """ - Export model with fixed shape. - """ - context.set_context(mode=context.GRAPH_MODE, device_target="CPU") - - num_tokens = 2 - num_head = 32 - head_dim = 128 - kv_head = 16 - num_blocks = 64 - block_size = 128 - max_num_blocks_per_batch = 8 - - q = Tensor(np.ones((num_tokens, num_head, head_dim), dtype=np.float16)) - key_cache = Tensor( - np.ones((num_blocks, block_size, kv_head, head_dim), dtype=np.float16) - ) - value_cache = Tensor( - np.ones((num_blocks, block_size, kv_head, head_dim), dtype=np.float16) - ) - block_tables = Tensor( - np.ones((num_tokens, max_num_blocks_per_batch), dtype=np.int32) - ) - context_len = Tensor(np.ones((num_tokens,), dtype=np.int32)) - alibi_mask = Tensor( - np.ones((num_tokens, num_head, 1, max_num_blocks_per_batch), dtype=np.float16) - ) - - file_name = "paged_attention_mask" - net = PagedAttentionMaskNet() - export( - net, - q, - key_cache, - value_cache, - block_tables, - context_len, - alibi_mask, - file_name=file_name, - file_format="MINDIR", - ) - model_name = file_name + ".mindir" - assert os.path.exists(model_name) - return model_name - - -def group_matmul(head, kv_head, a, b): - """ - Calculte a group(for all heads) of MatMul. - """ - group_num = head // kv_head - score = None - for i in range(kv_head): - group_score = np.matmul( - a[i * group_num : (i + 1) * group_num, :, :].astype(np.float32), - b[i : (i + 1), :, :].astype(np.float32), - ).astype(np.float16) - if score is None: - score = group_score - else: - score = np.concatenate((score, group_score), 0) - print(score.shape) - return score - - -def ref_masked_attention( - query, # (1, num_heads, head_size) - key, # (context_len, kv_heads, head_size) - value, - scale: float, - alibi_bias, -): - """ - Implement masked attention with numpy. - """ - # Q * K.T - query = query * scale - query = np.transpose(query, (1, 0, 2)) # 转置-> num_head, seqlen, head_size - key = np.transpose(key, (1, 2, 0)) # 转置 -> kv_heads, head_size, context_len - sim = group_matmul(query.shape[0], key.shape[0], query, key) - sim = sim + alibi_bias - - # softmax - row_max = np.max(sim, axis=-1, keepdims=True) - sim -= row_max - sim = sim.astype("float32") - sim = np.exp(sim) - row_sum = np.sum(sim, axis=-1, keepdims=True) - p = sim / row_sum - p = p.astype("float16") - # P * V - value = np.transpose(value, (1, 0, 2)) # 转置-> kv_heads, seqlen, head_size - out = group_matmul(query.shape[0], key.shape[0], p, value) - out = np.transpose(out, (1, 0, 2)) # 转置-> seqlen, num_head, head_size - return out - - -def ref_single_query_cached_kv_attention(output, paged_input) -> None: - """ - Implement single query attention with numpy. - """ - query, key_cache, value_cache, block_tables, context_lens, alibi_mask = paged_input - num_heads = query.shape[1] - kv_heads = value_cache.shape[2] - head_size = value_cache.shape[3] - block_size = value_cache.shape[1] - - num_input_tokens = query.shape[0] - for i in range(num_input_tokens): - q = np.expand_dims(query[i], 0) - block_table = block_tables[i] - context_len = int(context_lens[i]) - - # 读取不同content_len的key和value,拼接在一起。 - keys = [] - values = [] - for j in range(context_len): # 单个序列总的block个数 - block_number = int(block_table[j // block_size]) - block_offset = j % block_size - - k = key_cache[block_number, block_offset, :, :] - k = k.reshape(kv_heads, head_size) - keys.append(k) # 读取key的内容 - - v = value_cache[block_number, block_offset, :, :] - v = v.reshape(kv_heads, head_size) - values.append(v) # 读取value的内容 - keys = np.stack(np.array(keys), axis=0) - values = np.stack(np.array(values), axis=0) - print( - f"query.shape: {q.shape}, {q.dtype}, keys.shape: {keys.shape}, " - f"context_len: {context_len}, keyblocknum: {(context_len + block_size - 1) // block_size}, " - f"tail: {context_len % block_size}, alibi_bias.shape: {alibi_mask[i].shape}" - ) - scale = 1.0 / (head_size**0.5) # 1/sqrt(d) - - out = ref_masked_attention( - q, keys, values, scale, alibi_mask[i, :, :, :context_len] - ) # 计算attention - - out = out.reshape(num_heads, head_size) # 2D输出 - output[i] = out - - -def create_golden_data(num_tokens=2, kv_heads=16, block_size=128, num_blocks=64): - """ - Create golden data for PagedAttentionMask op. - """ - num_heads = 32 - head_size = 128 - dtype = "float16" - query = np.random.uniform( - -1.0, 1.0, size=(num_tokens, num_heads, head_size) - ).astype(dtype) - - # key value cache: (num_blocks, block_size, num_heads, head_size) - key_cache = np.random.uniform( - -1.0, 1.0, size=(num_blocks, block_size, kv_heads, head_size) - ).astype(dtype) - value_cache = np.random.uniform( - -1.0, 1.0, size=(num_blocks, block_size, kv_heads, head_size) - ).astype(dtype) - - context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_tokens)] - # context_lens = [1024] * num_tokens # 每个batch对应的seqlen - _ = [ - print(f"context_len: {x} % {block_size} == 1") - for x in context_lens - if x % block_size == 1 - ] - max_context_len = max(context_lens) - - max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size - block_tables = [] # (num_tokens, max_num_blocks_per_seq) - for i in range(num_tokens): - n_block = (context_lens[i] + block_size - 1) // block_size - print(f"n_block {i} = {n_block}") - n_pad_block = max_num_blocks_per_seq - n_block - block_table = [ - random.randint(0, num_blocks - 1) - for _ in range(n_block) # 给方块里面的每一个Block都分配了显存 - ] - if n_pad_block != 0: - block_table = block_table + ([-1] * n_pad_block) - print(f"block table {i} = {block_table}") - block_tables.append(block_table) - - context_lens = np.array(context_lens).astype(np.int32) - block_tables = np.array(block_tables).astype(np.int32) - - # alibi mask - alibi_slopes = np.random.random(num_heads).astype(np.float16) - alibi_mask = np.zeros((num_tokens, num_heads, 1, max_context_len), dtype=np.float16) - for i, context_len in enumerate(context_lens): - position_ids = np.arange(context_len).astype(np.int32) - alibi_bias = (position_ids - context_len + 1).astype( - np.float16 - ) # -context_len+1, -context_len+2,..,0 - alibi_bias = alibi_slopes.reshape(-1, 1, 1) * alibi_bias.reshape( - 1, 1, -1 - ) # (head_num, 1, context) - alibi_mask[i, :, :, :context_len] = alibi_bias - print(f"alibi_mask.shape = {alibi_mask.shape}") - - paged_input = [ - query, - key_cache, - value_cache, - block_tables, - context_lens, - alibi_mask, - ] - ref_output = np.zeros_like(query) - - # 计算输出 - ref_single_query_cached_kv_attention(ref_output, paged_input) - - print(f"==> query shape: {query.shape}, data: \n{query}") - print(f"==> key_cache shape: {key_cache.shape}") - print(f"==> value_cache shape: {value_cache.shape}") - print(f"==> block_tables shape: {block_tables.shape}, data: \n{block_tables}") - print(f"==> context_lens shape: {context_lens.shape}, data: \n{context_lens}") - print(f"==> alibi_mask shape: {alibi_mask.shape}, data: \n{alibi_mask}") - print("data generate done!") - ref_outputs = [ref_output] - return paged_input, ref_outputs - - -def do_mslite_infer(model_file, in_tensors): - """ - Do model inference with mslite. - """ - print(model_file) - lite_context = mslite.Context() - lite_context.target = ["ascend"] - lite_context.ascend.device_id = 2 - lite_context.ascend.provider = "ge" - lite_context.ascend.rank_id = 0 - model = mslite.Model() - - script_dir = os.path.dirname(__file__) - config_path = os.path.join(script_dir, "ascend_akg.ini") - print(f"Use config file: {config_path}") - model.build_from_file( - model_file, mslite.ModelType.MINDIR, lite_context, config_path=config_path - ) - - outputs = model.predict(in_tensors) - np_output: list[np.ndarray] = [] - for output in outputs: - np_output.append(output.get_data_to_numpy()) - print("outputs' shape: ", np_output[-1].shape) - print("finish------------------") - return np_output - - -def inference_model(mindir_model: str): - """ - Inference model. - """ - inputs, ref_outputs = create_golden_data() - - # 运行昇腾算子 - in_tensors = [mslite.Tensor(x) for x in inputs] - ascend_outputs = do_mslite_infer(mindir_model, in_tensors) - - for i, ascend_output in enumerate(ascend_outputs): - is_close = np.allclose(ref_outputs[i], ascend_output, rtol=1e-3, atol=1e-03) - logging.info("ref_outputs %d:\n%s", i, ref_outputs[i]) - logging.info("ascend_outputs %d:\n%s", i, ascend_output) - logging.info("ascend output %d is equal to ref output: %s", i, is_close) - assert is_close - - -def test_paged_attention_mask_fixed_shape(): - """ - Test PagedAttentionMask of fixed shape. - """ - model_path = export_model() - print(f"paged_attention_fixed_shape st : export success to path: {model_path}") - logging.info( - "paged_attention_mask_fixed_shape st : export success to path: %s", model_path - ) - - model_path = "paged_attention_mask.mindir" - - inference_model(model_path) - print("paged_attention_mask_fixed_shape st : inference success.") - - -if __name__ == "__main__": - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s", - filename="./test.log", - filemode="w", - ) - test_paged_attention_mask_fixed_shape() diff --git a/mindspore-lite/test/st/python/akg_custom_ops/test_reshape_and_cache.py b/mindspore-lite/test/st/python/akg_custom_ops/test_reshape_and_cache.py deleted file mode 100644 index b52e13a9..00000000 --- a/mindspore-lite/test/st/python/akg_custom_ops/test_reshape_and_cache.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Test ReshapeAndCache plugin custom ops. -""" -import os -import logging -from typing import List -import numpy as np -import mindspore_lite as mslite -from mindspore import nn, ops -from mindspore import Tensor, Parameter, context, export -from mindspore.ops.auto_generate.gen_ops_prim import ReshapeAndCache - - -class ReshapeAndCacheNet(nn.Cell): - """ - ReshapeAndCacheNet. - """ - - def __init__(self, num_blocks, block_size, kv_head, head_dim): - super().__init__() - self.key_cache = Parameter( - np.zeros((num_blocks, block_size, kv_head, head_dim), dtype=np.float16), - name="key_cache", - ) - self.value_cache = Parameter( - np.zeros((num_blocks, block_size, kv_head, head_dim), dtype=np.float16), - name="value_cache", - ) - self.reshape_and_cache = ReshapeAndCache() - self.depends = ops.Depend() - - def construct(self, key, value, slot_mapping): - out_key = self.reshape_and_cache( - key, value, self.key_cache, self.value_cache, slot_mapping - ) - out_key_cache = self.depends(self.key_cache, out_key) - out_value_cache = self.depends(self.value_cache, out_key) - return out_key, out_key_cache, out_value_cache - - -def export_model() -> str: - """ - Export model with fixed shape. - """ - context.set_context(mode=context.GRAPH_MODE, device_target="CPU") - - num_tokens = 512 - num_head = 40 - head_dim = 128 - block_size = 16 - num_blocks = 128 - - key = Tensor(np.ones((num_tokens, num_head, head_dim), dtype=np.float16)) - value = Tensor(np.ones((num_tokens, num_head, head_dim), dtype=np.float16)) - slot_mapping = Tensor(np.ones((num_tokens,), dtype=np.int32)) - - file_name = "reshape_and_cache" - net = ReshapeAndCacheNet(num_blocks, block_size, num_head, head_dim) - export(net, key, value, slot_mapping, file_name=file_name, file_format="MINDIR") - model_name = file_name + ".mindir" - assert os.path.exists(model_name) - return model_name - - -def ref_reshape_and_cache(key, value, key_cache, value_cache, slot_mapping): - """ - Implement reshape_and_cache with numpy. - """ - _, block_size, _, _ = key_cache.shape - key_out = np.zeros_like(key) - for i, slot_idx in enumerate(slot_mapping): - if slot_idx == -1: # skip special pad slot index -1. - continue - block_index = slot_idx // block_size - block_offset = slot_idx % block_size - key_cache[block_index, block_offset, :, :] = key[i, :, :] - value_cache[block_index, block_offset, :, :] = value[i, :, :] - key_out[i, :, :] = key[i, :, :] - return (key_out, key_cache, value_cache) - - -def create_input(cache_shape: List[int], update_shape: List[int], with_pad_slot: bool): - """ - Create numpy input data for ReshapeAndCache op. - """ - key = np.random.rand(*update_shape).astype(np.float16) - value = np.random.rand(*update_shape).astype(np.float16) - - key_cache = np.zeros(cache_shape).astype(np.float16) - value_cache = np.zeros(cache_shape).astype(np.float16) - - num_blocks = cache_shape[0] - block_size = cache_shape[1] - total_num_slots = num_blocks * block_size - - num_tokens = update_shape[0] - if with_pad_slot: - # construct a slot mapping case like: [x, ..., z, -1, ..., -1] - num_valid_token = num_tokens // 2 - num_pad_token = num_tokens - num_valid_token - slot_mapping = np.random.choice( - np.arange(0, total_num_slots), size=num_valid_token, replace=False - ).astype(np.int32) - pad_slots = np.array([-1 for _ in range(num_pad_token)], dtype=np.int32) - slot_mapping = np.concatenate((slot_mapping, pad_slots), axis=0) - else: - slot_mapping = np.random.choice( - np.arange(0, total_num_slots), size=num_tokens, replace=False - ).astype(np.int32) - - return key, value, key_cache, value_cache, slot_mapping - - -def create_golden_data(with_pad_slot: bool): - """ - Create golden data for ReshapeAndCache op. - """ - num_tokens = 512 - num_blocks = 128 - block_size = 16 - num_head = 40 - head_dim = 128 - - cache_shape = [num_blocks, block_size, num_head, head_dim] - update_shape = [num_tokens, num_head, head_dim] - key, value, key_cache, value_cache, slot_mapping = create_input( - cache_shape, update_shape, with_pad_slot - ) - logging.info("slot_mapping shape: %s, data:\n%s", slot_mapping.shape, slot_mapping) - - # generate golden output with numpy op implement. - key_golden, key_cahce_gloden, value_cache_golden = ref_reshape_and_cache( - key, value, key_cache, value_cache, slot_mapping - ) - - inputs = [key, value, slot_mapping] - ref_outputs = [key_golden, key_cahce_gloden, value_cache_golden] - - return inputs, ref_outputs - - -def do_mslite_infer(model_file, in_tensors): - """ - Do model inference with mslite. - """ - lite_context = mslite.Context() - lite_context.target = ["ascend"] - lite_context.ascend.device_id = 2 - lite_context.ascend.provider = "ge" - lite_context.ascend.rank_id = 0 - model = mslite.Model() - - script_dir = os.path.dirname(__file__) - config_path = os.path.join(script_dir, "ascend_akg.ini") - print(f"Use config file: {config_path}") - model.build_from_file( - model_file, mslite.ModelType.MINDIR, lite_context, config_path=config_path - ) - - out_tensors = model.predict(in_tensors) - np_output = [tensor.get_data_to_numpy() for tensor in out_tensors] - return np_output - - -def inference_model(mindir_model: str, with_pad_slot: bool): - """ - Inference model - """ - inputs, ref_outputs = create_golden_data(with_pad_slot) - # 运行昇腾算子 - in_tensors = [mslite.Tensor(x) for x in inputs] - ascend_outputs = do_mslite_infer(mindir_model, in_tensors) - - for i, ascend_output in enumerate(ascend_outputs): - is_close = np.allclose(ref_outputs[i], ascend_output, rtol=1e-3, atol=1e-03) - logging.info("ref_outputs %d:\n%s", i, ref_outputs[i]) - logging.info("ascend_outputs %d:\n%s", i, ascend_output) - logging.info("ascend output %d is equal to ref output: %s", i, is_close) - assert is_close - - -def test_reshape_and_cache_fixed_shape(): - """ - Test ReshapAndCache of fixed shape. - """ - model_path = export_model() - print(f"reshape_and_cache_dynamic_shape st : export success to path: {model_path}") - logging.info( - "reshape_and_cache_dynamic_shape st : export success to path: %s", model_path - ) - - inference_model(model_path, with_pad_slot=False) - print("reshape_and_cache_dynamic_shape st : inference success.") - - -def test_reshape_and_cache_skip_pad_slot(): - """ - Test ReshapAndCache with skipping index: -1. - """ - model_path = export_model() - print(f"reshape_and_cache_skip_pad_slot st : export success to path: {model_path}") - logging.info( - "reshape_and_cache_skip_pad_slot st : export success to path: %s", model_path - ) - - inference_model(model_path, with_pad_slot=True) - print("reshape_and_cache_skip_pad_slot st : inference success.") - - -if __name__ == "__main__": - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s", - filename="./test.log", - filemode="w", - ) - test_reshape_and_cache_fixed_shape() - test_reshape_and_cache_skip_pad_slot() diff --git a/mindspore-lite/test/st/python/optimize_pass/test_concat_op_pass.py b/mindspore-lite/test/st/python/optimize_pass/test_concat_op_pass.py deleted file mode 100644 index b8229dea..00000000 --- a/mindspore-lite/test/st/python/optimize_pass/test_concat_op_pass.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Test concat op pass. -""" -import os -import sys -import time -import numpy as np -import mindspore_lite as mslite -import mindspore as ms -from mindspore import Tensor, ops, nn -import mindspore.common.dtype as mstype -from mindspore import context - - -class ConcatOpPassNet(nn.Cell): - """ - KVCacheMgrNet. - """ - def __init__(self): - super().__init__() - self.pad = ops.PadV3() - self.concat = ops.Concat(axis=0) - - def construct(self, key): - pad_length = key.astype(mstype.int64) - key_paddings = self.concat((Tensor([0, 0, 0, 0, 0], mstype.int64), pad_length, Tensor([0, 0], mstype.int64))) - return key_paddings - - -def dummy_tensor(shape, dtype): - """create dummy tensor""" - if None in shape: - return Tensor(shape=shape, dtype=dtype) - return Tensor(np.ones(shape=tuple(shape)), dtype=dtype) - - -def export_model(): - """ - export model - """ - in_key = dummy_tensor(shape=[None], dtype=ms.int64) - context.set_context(mode=context.GRAPH_MODE, device_target="CPU") - - net = ConcatOpPassNet() - file_name = "concat_op_pass" - ms.export(net, in_key, file_name=file_name, file_format="MINDIR") - model_name = file_name + ".mindir" - assert os.path.exists(model_name) - return model_name - - -def inference(): - """ - def inference_concat_op_pass - """ - time_start_total = time.time() - model_path = export_model() - - lite_ctx0 = mslite.Context() - lite_ctx0.target = ["ascend"] - lite_ctx0.ascend.device_id = 0 - lite_ctx0.ascend.provider = "ge" - model = mslite.Model() - model.build_from_file(model_path, mslite.ModelType.MINDIR, lite_ctx0) - # warm up - np_data = np.ones((1), np.int64) - outputs = model.predict([mslite.Tensor(np_data)]) - result = np.array([0, 0, 0, 0, 0, 1, 0, 0]) - os.remove(model_path) - print(f"predict cost total {(time.time() - time_start_total)*1000} ms", flush=True) - assert (outputs[0].get_data_to_numpy() == result).all() - - -if __name__ == '__main__': - print("test_concat_op_pass.py: begin run testcases.") - backend = sys.argv[1] - if backend == "Ascend": - inference() - else: - print(f'test_concat_op_pass.py: skip backend {backend}!') - print("test_concat_op_pass.py: run testcases success.") diff --git a/mindspore-lite/test/st/python/optimize_pass/test_padv3_ge_pass.py b/mindspore-lite/test/st/python/optimize_pass/test_padv3_ge_pass.py deleted file mode 100644 index d3b9b097..00000000 --- a/mindspore-lite/test/st/python/optimize_pass/test_padv3_ge_pass.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Test concat op pass. -""" -import os -import sys -import time -import numpy as np -import mindspore_lite as mslite -import mindspore as ms -from mindspore import Tensor, ops, nn -import mindspore.common.dtype as mstype -from mindspore import context - - -class PadV3GePassNet(nn.Cell): - """ - PadV3GePassNet. - """ - - def __init__(self): - super().__init__() - self.pad = ops.PadV3() - self.concat = ops.Concat(axis=0) - self.sub = ops.Sub() - self.up_to_len = 10 - - def construct(self, key): - pad_length = ( - self.sub(ms.Tensor(self.up_to_len, mstype.int32), ops.dyn_shape(key)[-2]) - .reshape((1,)) - .astype(mstype.int32) - ) - key_paddings = self.concat( - ( - Tensor([0, 0, 0], mstype.int32), - pad_length, - Tensor([0, 0, 0, 0], mstype.int32), - ) - ) - key_present = self.pad(key, key_paddings, Tensor(0, mstype.float16)) - return key_present - - -def dummy_tensor(shape, dtype): - """create dummy tensor""" - if None in shape: - return Tensor(shape=shape, dtype=dtype) - return Tensor(np.ones(shape=tuple(shape)), dtype=dtype) - - -def export_model(): - """ - export model - """ - in_key = dummy_tensor(shape=[1, 2, None, 2], dtype=ms.float16) - context.set_context(mode=context.GRAPH_MODE, device_target="CPU") - - net = PadV3GePassNet() - file_name = "padv3_ge_pass" - ms.export(net, in_key, file_name=file_name, file_format="MINDIR") - model_name = file_name + ".mindir" - assert os.path.exists(model_name) - return model_name - - -def inference_common(): - """ - def inference_padv3_ge_pass - """ - time_start_total = time.time() - model_path = export_model() - - lite_ctx0 = mslite.Context() - lite_ctx0.target = ["ascend"] - lite_ctx0.ascend.device_id = 0 - lite_ctx0.ascend.provider = "ge" - model = mslite.Model() - model.build_from_file(model_path, mslite.ModelType.MINDIR, lite_ctx0) - - # input data - np_data = np.zeros((1, 2, 4, 2), np.float16) - # change the first 2 elements of the third dimension to 1 - np_data[:, :, :2, :] = 1.0 - outputs = model.predict([mslite.Tensor(np_data)]) - result = np.zeros((1, 2, 10, 2), np.float16) - # change the first 2 elements of the third dimension to 1 - result[:, :, :2, :] = 1.0 - os.remove(model_path) - print(f"predict cost total {(time.time() - time_start_total)*1000} ms", flush=True) - print("gold shape:") - print(result.shape) - print(result) - print("model output shape:") - np_out = outputs[0].get_data_to_numpy() - print(np_out.shape) - print(np_out) - - assert np_out.shape == result.shape - assert (np_out == result).all() - - -if __name__ == "__main__": - print("test_padv3_ge_pass.py: begin run testcases.") - backend = sys.argv[1] - if backend == "Ascend": - inference_common() - else: - print(f"test_padv3_ge_pass.py: skip backend {backend}!") - print("test_padv3_ge_pass.py: run testcases success.") diff --git a/mindspore-lite/test/st/python/plugin_custom_ops/test_kv_cache_mgr.py b/mindspore-lite/test/st/python/plugin_custom_ops/test_kv_cache_mgr.py deleted file mode 100644 index 80a52a8f..00000000 --- a/mindspore-lite/test/st/python/plugin_custom_ops/test_kv_cache_mgr.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2023 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -""" -Test KVCacheMgr plugin custom ops. -""" -import os -import sys -import time -import numpy as np -import mindspore_lite as mslite -import mindspore as ms -import mindspore.nn as nn -from mindspore.ops import operations as P -from mindspore.ops import functional as F -from mindspore.train.serialization import export -from mindspore import Tensor -from mindspore import context - - -class KVCacheMgrNet(nn.Cell): - """ - KVCacheMgrNet. - """ - def __init__(self, batch_size, src_seq_length): - super().__init__() - self.mul = P.Mul() - self.add = P.Add() - self.tile = P.Tile() - self.expand_dims = P.ExpandDims() - self.dtype = ms.float16 - - seq_range = np.arange(src_seq_length).reshape(1, 1, -1) - self.range = Tensor(np.tile(seq_range, (batch_size, 1, 1)), ms.int32) - self.equal = P.Equal() - self.sub = P.Sub() - - def construct(self, key_past, key, value_past, value, batch_valid_length): - current_index = F.reshape(batch_valid_length, (-1, 1, 1)) - current_mask = F.cast(self.equal(self.range, current_index), self.dtype) - # Pad the key and value to seq_length with only the position index not zero - current_key = self.mul(key, self.expand_dims(current_mask, 3)) - current_value = self.mul(value, self.expand_dims(current_mask, 3)) - # Concat the previous saved state and current state - key = self.add(key_past, current_key) - value = self.add(value_past, current_value) - - ans = self.sub(key, value) - return ans - - -def create_shapes(): - batch_size = 1 - num_head = 40 - seq_length = 1024 - update_seq_length = 1 - size_pre_head = 128 - past_shape = (batch_size, num_head, seq_length, size_pre_head) - cur_shape = (batch_size, num_head, update_seq_length, size_pre_head) - return past_shape, cur_shape - - -def create_inputs(): - """ - create inputs. - """ - past_shape, cur_shape = create_shapes() - - key_past = Tensor(np.random.rand(*past_shape), ms.float16) - key_cur = Tensor(np.random.rand(*cur_shape), ms.float16) - value_past = Tensor(np.random.rand(*past_shape), ms.float16) - value_cur = Tensor(np.random.rand(*cur_shape), ms.float16) - index = Tensor(shape=(1,), dtype=ms.int32, init=1) - return (key_past, key_cur, value_past, value_cur, index) - - -def create_lite_inputs(): - """ - create lite inputs. - """ - past_shape, cur_shape = create_shapes() - - key_past = mslite.Tensor(np.zeros(past_shape, np.float16)) - key_cur = mslite.Tensor(np.random.rand(*cur_shape).astype(np.float16)) - value_past = mslite.Tensor(np.zeros(past_shape, np.float16)) - value_cur = mslite.Tensor(np.random.rand(*cur_shape).astype(np.float16)) - index = mslite.Tensor(np.ones(1).astype(np.int32)) - return (key_past, key_cur, value_past, value_cur, index) - - -def export_model(): - """ - export model - """ - context.set_context(mode=context.GRAPH_MODE, device_target="CPU") - key_past, key_cur, value_past, value_cur, index = create_inputs() - batch_size = key_past.shape[0] - src_seq_length = key_past.shape[-2] - - net = KVCacheMgrNet(batch_size, src_seq_length) - file_name = "kv_cache_mgr_net" - - export(net, key_past, key_cur, value_past, value_cur, - index, file_name=file_name, file_format='MINDIR') - model_name = file_name + ".mindir" - assert os.path.exists(model_name) - return model_name - - -def inference_kv_cache_mgr(): - """ - def inference_kv_cache_mgr - """ - time_start_total = time.time() - model_path = export_model() - input_lists = list(create_lite_inputs()) - - lite_ctx0 = mslite.Context() - lite_ctx0.target = ["ascend"] - lite_ctx0.ascend.device_id = 0 - lite_ctx0.ascend.provider = "ge" - model0 = mslite.Model() - model0.build_from_file(model_path, mslite.ModelType.MINDIR, lite_ctx0) - # warm up - outputs0 = model0.predict(input_lists) - time_start = time.time() - outputs0 = model0.predict(input_lists) - print(f"predict plugin_custom_ops=None cost {(time.time() - time_start)*1000} ms", flush=True) - - lite_ctx1 = mslite.Context() - lite_ctx1.target = ["ascend"] - lite_ctx1.ascend.device_id = 0 - lite_ctx1.ascend.provider = "ge" - dict1 = {"ascend_context": {"plugin_custom_ops": "All"}} - model1 = mslite.Model() - model1.build_from_file(model_path, mslite.ModelType.MINDIR, lite_ctx1, "", dict1) - # warm up - outputs1 = model1.predict(input_lists) - time_start = time.time() - outputs1 = model1.predict(input_lists) - print(f"predict plugin_custom_ops=All cost {(time.time() - time_start)*1000} ms", flush=True) - - os.remove(model_path) - print(f"predict cost total {(time.time() - time_start_total)*1000} ms", flush=True) - assert (outputs0[0].get_data_to_numpy() == outputs1[0].get_data_to_numpy()).all() - - -if __name__ == '__main__': - print("test_kv_cache_mgr_plugin_custom_ops.py: begin run testcases.") - backend = sys.argv[1] - if backend == "Ascend": - inference_kv_cache_mgr() - else: - print(f'test_kv_cache_mgr_plugin_custom_ops.py: skip backend {backend}!') - print("test_kv_cache_mgr_plugin_custom_ops.py: run testcases success.") -- Gitee