diff --git a/aikg/python/ai_kernel_generator/config/default_swft_config.yaml b/aikg/python/ai_kernel_generator/config/default_swft_config.yaml
index 2c5ce3c554e726f2a8fd9eb9d6719eb56c0a85a9..d10ad1b8af11350a81e41ccf5f34b54edc2aa98f 100644
--- a/aikg/python/ai_kernel_generator/config/default_swft_config.yaml
+++ b/aikg/python/ai_kernel_generator/config/default_swft_config.yaml
@@ -1,12 +1,12 @@
 # Model preset configuration
 agent_model_config:
-  designer: deepseek_r1_default
-  coder: deepseek_r1_default
-  conductor: deepseek_r1_default
-  api_generator: deepseek_r1_default
-  example_compressor: deepseek_r1_default
-  feature_extractor: deepseek_r1_default
-  default: deepseek_r1_default
+  designer: vllm_deepseek_v31_default
+  coder: vllm_deepseek_v31_default
+  conductor: vllm_deepseek_v31_default
+  api_generator: vllm_deepseek_v31_default
+  example_compressor: vllm_deepseek_v31_default
+  feature_extractor: vllm_deepseek_v31_default
+  default: vllm_deepseek_v31_default
 
 # Log configuration
 log_dir: "~/aikg_logs"
diff --git a/aikg/python/ai_kernel_generator/resources/templates/kernel_verify_template.j2 b/aikg/python/ai_kernel_generator/resources/templates/kernel_verify_template.j2
index 82a3d10e5f6c0d28f9ce9055b8e1223857d1a6cd..590762aa59886daa38a00704b8435b897823e776 100644
--- a/aikg/python/ai_kernel_generator/resources/templates/kernel_verify_template.j2
+++ b/aikg/python/ai_kernel_generator/resources/templates/kernel_verify_template.j2
@@ -20,6 +20,9 @@ from {{ op_name }}_numpy import get_inputs_dyn_list
 {% else %}
 from {{ op_name }}_numpy import get_inputs
 {% endif %}
+from swft.core import *
+from swft.api import *
+from swft.runtime import *
 TensorType = np.ndarray
 {% elif framework == "mindspore" %}
 import mindspore as ms
@@ -125,68 +128,6 @@ def load_tensor(bin_path: str, expect_tensor: TensorType) -> TensorType:
     return ms.Tensor(numpy_tensor, dtype=expect_tensor.dtype)
 {% endif %}
 
-def gen_binary_data(inputs, outputs, data_dir):
-    """生成二进制数据文件
-
-    Args:
-        inputs: 输入张量列表
-        outputs: 输出张量列表或单个张量
-        data_dir: 数据保存目录
-    """
-    os.makedirs(data_dir, exist_ok=True)
-
-    # 创建输入输出目录
-    input_dir = os.path.join(data_dir, "{{ op_name }}", "input")
-    output_dir = os.path.join(data_dir, "{{ op_name }}", "output")
-    os.makedirs(input_dir, exist_ok=True)
-    os.makedirs(output_dir, exist_ok=True)
-
-    # 保存输入数据
-    for i, input_tensor in enumerate(inputs):
-        if isinstance(input_tensor, TensorType):
-            bin_path = os.path.join(input_dir, f"input{i}.bin")
-            save_tensor(input_tensor, bin_path)
-
-    # 处理输出数据
-    if not isinstance(outputs, (list, tuple)):
-        outputs = [outputs]  # 将单个张量转换为列表
-
-    # 保存golden输出
-    for i, output_tensor in enumerate(outputs):
-        if isinstance(output_tensor, TensorType):
-            golden_path = os.path.join(output_dir, f"output{i}_golden.bin")
-            save_tensor(output_tensor, golden_path)
-
-def load_binary_data(data_dir, reference_outputs):
-    """加载二进制数据文件并转换为张量
-
-    Args:
-        data_dir: 数据目录
-        reference_outputs: 参考输出张量列表或单个张量,用于确定数据类型和形状
-
-    Returns:
-        加载的张量列表
-    """
-    if not isinstance(reference_outputs, (list, tuple)):
-        reference_outputs = [reference_outputs]
-
-    output_dir = os.path.join(data_dir, "{{ op_name }}", "output")
-    loaded_outputs = []
-    i = 0
-    while True:
-        output_path = os.path.join(output_dir, f"output{i}_actual.bin")
-        if not os.path.exists(output_path):
-            break
-        if i >= len(reference_outputs):
-            raise RuntimeError(f"输出文件数量({i+1})超过参考输出数量({len(reference_outputs)})")
-        loaded_outputs.append(load_tensor(output_path, reference_outputs[i]))
-        i += 1
-
-    if not loaded_outputs:
-        raise RuntimeError("未找到任何输出文件, 一般是因为输入数据类型和原任务的输入数据类型不匹配")
RuntimeError("未找到任何输出文件, 一般是因为输入数据类型和原任务的输入数据类型不匹配") - - return loaded_outputs - {% if "triton" in dsl and backend == "cuda" and arch == "a100" %} def get_limit(data_type): import torch @@ -374,12 +315,17 @@ def verify_implementations(): except (AttributeError, TypeError): return x {% else %} - return x + if isinstance(x, np.ndarray): + tensor = Tensor(x, multi_core=True) + tensor.sync_host_to_device() + return tensor + else: + return x {% endif %} def verify_single_case(inputs): """验证单个案例的公共逻辑""" - {% if backend == "ascend" %} + {% if backend == "ascend" and framework == "torch" %} torch.npu.manual_seed(0) {% endif %} @@ -387,17 +333,12 @@ def verify_implementations(): framework_output = framework_model(*inputs) {% if dsl == "swft" %} - # 运行SWFT实现 - data_dir = os.path.dirname(__file__) - - # 生成二进制数据文件 - gen_binary_data(inputs, framework_output, data_dir) - - # 运行SWFT实现 - {{ impl_func_name }}(device_id=int({{ device_id }})) - - # 加载SWFT输出 - impl_output = load_binary_data(data_dir, framework_output) + npu_session = NPUSession.create(device_id={{ device_id }}, context="310P") + # 将输入转为swft.core.Tensor + inputs = [process_input(x) for x in inputs] + impl_output = {{ impl_func_name }}(*inputs) + npu_session.sync_stream() + impl_output.sync_device_to_host() {% elif dsl in ["triton", "cuda_c", "cpp"] %} # 运行Triton实现 impl_output = {{ impl_func_name }}(*inputs) @@ -428,6 +369,7 @@ def verify_implementations(): impl_out_flatten = impl_out_flatten.detach().cpu().numpy() {% elif framework == "numpy" %} framework_out_flatten = fw_out.flatten() + impl_out = impl_out.as_numpy() impl_out_flatten = impl_out.flatten() {% elif framework == "mindspore" %} framework_out_flatten = fw_out.flatten().asnumpy() diff --git a/aikg/tests/resources/tanh_op/tanh_numpy.py b/aikg/tests/resources/tanh_op/tanh_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..44fb5372452080849852e47f714322630fba335a --- /dev/null +++ b/aikg/tests/resources/tanh_op/tanh_numpy.py @@ -0,0 +1,16 @@ +import numpy as np +class Model: + def __init__(self): + super(Model, self).__init__() + + def __call__(self, x: np.ndarray) -> np.ndarray: + return np.tanh(x) + +batch_size = 16 +dim = 16384 + +def get_inputs(): + return [np.random.randn(batch_size, dim).astype(np.float16)] + +def get_init_inputs(): + return [] \ No newline at end of file diff --git a/aikg/tests/resources/tanh_op/tanh_swft.py b/aikg/tests/resources/tanh_op/tanh_swft.py new file mode 100644 index 0000000000000000000000000000000000000000..8647542a7942495edb9cfb81f07b0367c80029fa --- /dev/null +++ b/aikg/tests/resources/tanh_op/tanh_swft.py @@ -0,0 +1,17 @@ +import numpy as np +from swft.core import * +from swft.api import * +from swft.runtime import * + +@native_jit(core_num=8) +def tanh_kernel(x, out): + x_ub = move_to_ub(x) + tanh_ub = tanh(x_ub) + out.load(tanh_ub) + +def tanh_swft_numpy(x_np): + out = Tensor("GM", "FP16", x_np.shape, "ND", False) + x_np.sync_host_to_device() + tanh_kernel(x_np, out) + return out + diff --git a/aikg/tests/st/test_kernel_verifier.py b/aikg/tests/st/test_kernel_verifier.py index 91c06d9dd797b58834db6aba1c239c37dba07f3b..4431b08070b2b7a0fa9735993cde22607a38d716 100644 --- a/aikg/tests/st/test_kernel_verifier.py +++ b/aikg/tests/st/test_kernel_verifier.py @@ -22,6 +22,45 @@ from ai_kernel_generator.config.config_validator import load_config device_id = os.getenv("DEVICE_ID", 1) +@pytest.mark.level0 +@pytest.mark.numpy +@pytest.mark.swft +@pytest.mark.ascend +@pytest.mark.ascend310p3 +@pytest.mark.parametrize("op_name", 
["tanh"]) +def test_kernel_verifier_ascend310p3_swft(op_name): + framework = "numpy" + dsl = "swft" + backend = "ascend" + arch = "ascend310p3" + config = load_config(dsl) # unused + # 读取框架实现代码 + op_task_file = f"./tests/resources/{op_name}_op/{op_name}_{framework}.py" + with open(op_task_file, "r", encoding="utf-8") as f: + op_task_str = textwrap.dedent(f.read()) + + # 读取实现代码 + kernel_path = f"./tests/resources/{op_name}_op/{op_name}_{dsl}.py" + with open(kernel_path, "r", encoding="utf-8") as f: + kernel_code = f.read() + + log_dir = create_log_dir(f'{op_name}_{framework}_{backend}_{arch}_{dsl}_test') + impl_func_name = f"{op_name}_{dsl}_{framework}" + verifier = KernelVerifier( + op_name=op_name, + framework_code=op_task_str, + framework=framework, + dsl=dsl, + backend=backend, + arch=arch, + impl_func_name=impl_func_name, + config=config + ) + task_info = {} + task_info["coder_code"] = kernel_code + result, error_log = verifier.run(task_info, device_id=device_id) + assert result, f"验证失败: {error_log}" + @pytest.mark.level0 @pytest.mark.mindspore @pytest.mark.triton @@ -61,7 +100,6 @@ def test_kernel_verifier_ascend910b4_mindspore(op_name): result, error_log = verifier.run(task_info, device_id=device_id) assert result, f"验证失败: {error_log}" - @pytest.mark.level0 @pytest.mark.torch @pytest.mark.triton