diff --git a/S1/ICVXKH/example_cudacode.py b/S1/ICVXKH/example_cudacode.py
index 3232300efee7e8feeb1170f2ea03d12b1295c238..f95d0786c2bf5ed2ce522ac7d523acfc208fa2e2 100644
--- a/S1/ICVXKH/example_cudacode.py
+++ b/S1/ICVXKH/example_cudacode.py
@@ -11,7 +11,7 @@ __global__ void relu_kernel(const float* x, float* y, int size) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < size) {
         y[idx] = fmaxf(x[idx], 0.f);
-    } 
+    }
 }
 
 torch::Tensor relu_cuda(torch::Tensor x) {
diff --git a/S1/ICVXKH/example_torchcode.py b/S1/ICVXKH/example_torchcode.py
index 7e9d5f81ee131019d1d7b05c161a44ba01c2bf30..1322b740e5c0f66ac68660b475636eadd89d1e2c 100644
--- a/S1/ICVXKH/example_torchcode.py
+++ b/S1/ICVXKH/example_torchcode.py
@@ -11,7 +11,7 @@ class Model(nn.Module):
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
-        Performs matrix multiplication and applies ReLU activation. 
+        Performs matrix multiplication and applies ReLU activation.
 
         Args:
             x (torch.Tensor): Input tensor of shape [batch_size, input_dim]
diff --git a/S1/ICVXKH/prompt.txt b/S1/ICVXKH/prompt.txt
index 0deaedc34e51c38683026b099aefb08c820ea7f9..4da4fbf20c0e67bd352791afba159c6a0daf2370 100644
--- a/S1/ICVXKH/prompt.txt
+++ b/S1/ICVXKH/prompt.txt
@@ -9,7 +9,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
- 
+
 class Model(nn.Module):
     def __init__(self) -> None:
         super().__init__()
diff --git a/S1/ICVXKH/readme.md b/S1/ICVXKH/readme.md
index 1447c24c7d000100b76060b829fff2fcbb93515f..787662d6e1f0d88b9339b3d11a568a5e5688579d 100644
--- a/S1/ICVXKH/readme.md
+++ b/S1/ICVXKH/readme.md
@@ -6,4 +6,4 @@
 example_cudacode.py: the CUDA code corresponding to the torch code
 
 prompt.txt: an example prompt for generating CUDA code from torch code with an LLM (the original torch code is appended at the end of the prompt)
-run_code.py: example code for testing whether the generated CUDA code produces the same output as the original torch code, and for measuring the speedup 
+run_code.py: example code for testing whether the generated CUDA code produces the same output as the original torch code, and for measuring the speedup
diff --git a/S1/ICVXKH/run_code.py b/S1/ICVXKH/run_code.py
index 24e869475896c03b5ea8ad44742706f3182d9154..54ce4b25943a3ce0378ff7ca45ac1081471e4a3c 100644
--- a/S1/ICVXKH/run_code.py
+++ b/S1/ICVXKH/run_code.py
@@ -9,7 +9,7 @@ from example_cudacode import ModelNew
 
 def run_benchmark():
     # Check whether CUDA is available
-    if not torch.cuda.is_available(): 
+    if not torch.cuda.is_available():
         print("CUDA is not available. Please make sure you have a usable NVIDIA GPU and have installed the CUDA build of PyTorch.")
         return
     else:
diff --git "a/S1/ICVXKH/\345\217\202\350\265\233\350\200\205\351\234\200\350\246\201\346\217\220\344\276\233\347\232\204\345\206\205\345\256\271.md" "b/S1/ICVXKH/\345\217\202\350\265\233\350\200\205\351\234\200\350\246\201\346\217\220\344\276\233\347\232\204\345\206\205\345\256\271.md"
index cb7588043339a54d759bed2514a78ca7014a9ef6..72586d94ef3537b27a0a0f977c20f4406a8e0775 100644
--- "a/S1/ICVXKH/\345\217\202\350\265\233\350\200\205\351\234\200\350\246\201\346\217\220\344\276\233\347\232\204\345\206\205\345\256\271.md"
+++ "b/S1/ICVXKH/\345\217\202\350\265\233\350\200\205\351\234\200\350\246\201\346\217\220\344\276\233\347\232\204\345\206\205\345\256\271.md"
@@ -1,4 +1,4 @@
 1. Following the format of example_torchcode.py, provide a torch implementation of an op, named torchcode.py
 2. Following the style of prompt.txt, use an LLM (DeepSeek, Qwen, GPT, Gemini, or another large model) to generate an initial CUDA operator, organize it into a runnable CUDA op in the format of example_cudacode.py, name it cudacode_ori.py, and check the operator's accuracy with run_code.py
 3. Starting from a cudacode_ori.py that meets the accuracy requirement, optimize the CUDA operator's performance, checking accuracy and speedup with run_code.py, to produce the final, best-performing CUDA operator implementation, named cudacode_opt.py, in the format of example_cudacode.py
-4. For each op, participants must provide four files: torchcode.py, prompt.txt, cudacode_ori.py, example_cudacode.py 
\ No newline at end of file
+4. For each op, participants must provide four files: torchcode.py, prompt.txt, cudacode_ori.py, example_cudacode.py
\ No newline at end of file
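The diff only shows fragments of example_cudacode.py: the body of relu_kernel and the signature of relu_cuda, while run_code.py imports a ModelNew class from the same file. For orientation, below is a minimal sketch of how such a file is commonly assembled with torch.utils.cpp_extension.load_inline. Only relu_kernel, relu_cuda, and ModelNew are names taken from the diff; the launch configuration, the extension name, and the decision to show only the ReLU stage (the matmul stage of the original Model is not reproduced here) are illustrative assumptions, not the repo's actual code.

# Hedged sketch of a complete example_cudacode.py-style file, assuming the
# common load_inline pattern; see the lead-in above for which names are real.
import torch
import torch.nn as nn
from torch.utils.cpp_extension import load_inline

cuda_source = r"""
#include <torch/extension.h>

__global__ void relu_kernel(const float* x, float* y, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        y[idx] = fmaxf(x[idx], 0.f);
    }
}

torch::Tensor relu_cuda(torch::Tensor x) {
    auto y = torch::empty_like(x);
    int size = x.numel();
    int threads = 256;                              // assumed block size
    int blocks = (size + threads - 1) / threads;    // ceil(size / threads)
    relu_kernel<<<blocks, threads>>>(
        x.data_ptr<float>(), y.data_ptr<float>(), size);
    return y;
}
"""

cpp_source = "torch::Tensor relu_cuda(torch::Tensor x);"

# Compile the inline CUDA source into a loadable extension module.
relu_ext = load_inline(
    name="relu_ext",            # assumed module name
    cpp_sources=cpp_source,
    cuda_sources=cuda_source,
    functions=["relu_cuda"],
    verbose=False,
)

class ModelNew(nn.Module):
    """Sketch of the drop-in replacement for Model; only the ReLU stage
    is routed through the custom kernel here."""
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The kernel indexes raw float memory, so a contiguous fp32
        # tensor is assumed.
        return relu_ext.relu_cuda(x.contiguous())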
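The readme describes run_code.py as checking that the generated CUDA code produces the same output as the original torch code and measuring the speedup, and the diff shows it defines run_benchmark() and imports ModelNew. A hedged sketch of that kind of harness follows; the input shape, tolerance, and iteration count are illustrative assumptions, and only run_benchmark, Model, and ModelNew are names taken from the diff.

# Hedged sketch of a run_code.py-style correctness and speedup check.
import time
import torch

from example_torchcode import Model
from example_cudacode import ModelNew

def run_benchmark(iters: int = 100):
    if not torch.cuda.is_available():
        print("CUDA is not available; an NVIDIA GPU and a CUDA build of PyTorch are required.")
        return

    device = torch.device("cuda")
    x = torch.randn(1024, 1024, device=device)   # assumed input shape
    ref_model, new_model = Model().to(device), ModelNew().to(device)

    # Correctness: the two outputs must agree within a tolerance.
    with torch.no_grad():
        ref, out = ref_model(x), new_model(x)
    assert torch.allclose(ref, out, atol=1e-4), "CUDA output diverges from torch output"

    # Timing helper: synchronize around the timed region so the GPU work
    # has actually finished before the clock is read.
    def timeit(fn):
        torch.cuda.synchronize()
        start = time.perf_counter()
        with torch.no_grad():
            for _ in range(iters):
                fn(x)
        torch.cuda.synchronize()
        return (time.perf_counter() - start) / iters

    t_ref, t_new = timeit(ref_model), timeit(new_model)
    print(f"torch: {t_ref*1e3:.3f} ms  cuda: {t_new*1e3:.3f} ms  speedup: {t_ref/t_new:.2f}x")

if __name__ == "__main__":
    run_benchmark()

The synchronization calls matter because CUDA kernel launches are asynchronous: without them the timer would largely measure launch overhead rather than the kernels themselves, and the reported speedup would be meaningless.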