diff --git a/S1/ICVXKH/__pycache__/example_cudacode.cpython-310.pyc b/S1/ICVXKH/__pycache__/example_cudacode.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d786c172788957527c1ff7387fbecd45edd20660
Binary files /dev/null and b/S1/ICVXKH/__pycache__/example_cudacode.cpython-310.pyc differ
diff --git a/S1/ICVXKH/__pycache__/example_torchcode.cpython-310.pyc b/S1/ICVXKH/__pycache__/example_torchcode.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e8577182d55489b52d582c4eb7e749dcae4cc58
Binary files /dev/null and b/S1/ICVXKH/__pycache__/example_torchcode.cpython-310.pyc differ
diff --git a/S1/ICVXKH/example_cudacode.py b/S1/ICVXKH/example_cudacode.py
index 3232300efee7e8feeb1170f2ea03d12b1295c238..201fc7c2792a40eda9d9a3fb09e6d9f6d6357682 100644
--- a/S1/ICVXKH/example_cudacode.py
+++ b/S1/ICVXKH/example_cudacode.py
@@ -14,6 +14,7 @@ __global__ void relu_kernel(const float* x, float* y, int size) {
     }
 }
 
+
 torch::Tensor relu_cuda(torch::Tensor x) {
     auto size = x.numel();
     auto y = torch::empty_like(x);
diff --git a/S1/ICVXKH/example_torchcode.py b/S1/ICVXKH/example_torchcode.py
index 7e9d5f81ee131019d1d7b05c161a44ba01c2bf30..5a3a4c938cf011405bcb36edc2a39fb2c146441b 100644
--- a/S1/ICVXKH/example_torchcode.py
+++ b/S1/ICVXKH/example_torchcode.py
@@ -14,6 +14,8 @@ class Model(nn.Module):
         Performs matrix multiplication and applies ReLU activation.
 
         Args:
+
+
             x (torch.Tensor): Input tensor of shape [batch_size, input_dim]
 
         Returns:
diff --git a/S1/ICVXKH/prompt.txt b/S1/ICVXKH/prompt.txt
index 0deaedc34e51c38683026b099aefb08c820ea7f9..5f46ba98a7d7a53ce5831b8ce3f7344f35c99471 100644
--- a/S1/ICVXKH/prompt.txt
+++ b/S1/ICVXKH/prompt.txt
@@ -18,6 +18,7 @@ class Model(nn.Module):
         return a + b
 
 
+
 def get_inputs():
     # randomly generate input tensors based on the model architecture
     a = torch.randn(1, 128).cuda()
diff --git a/S1/ICVXKH/readme.md b/S1/ICVXKH/readme.md
index 1447c24c7d000100b76060b829fff2fcbb93515f..be728f63a61920ebc8782b99ab7faa630e3511d2 100644
--- a/S1/ICVXKH/readme.md
+++ b/S1/ICVXKH/readme.md
@@ -7,3 +7,4 @@
 example_cudacode.py: the CUDA code corresponding to the torch code
 prompt.txt: an example prompt for generating CUDA code from the torch code with an LLM (the original torch code is appended at the end of the prompt)
 run_code.py: example code for testing whether the generated CUDA code produces the same output as the original torch code and for measuring the speedup
+
diff --git a/S1/ICVXKH/run_code.py b/S1/ICVXKH/run_code.py
index 24e869475896c03b5ea8ad44742706f3182d9154..d1d4ef7e71184ba30e738dcd81500c788a2364ac 100644
--- a/S1/ICVXKH/run_code.py
+++ b/S1/ICVXKH/run_code.py
@@ -15,6 +15,7 @@ def run_benchmark():
     else:
         device = torch.device("cuda")
 
+
     # Initialize the model
     init_inputs = get_init_inputs()
     init_inputs = [
diff --git a/S1/ICVXKH/参赛者需要提供的内容.md b/S1/ICVXKH/参赛者需要提供的内容.md
index cb7588043339a54d759bed2514a78ca7014a9ef6..753dc87e5a95c2f30bd01a4946e28f61da0e18ac 100644
--- a/S1/ICVXKH/参赛者需要提供的内容.md
+++ b/S1/ICVXKH/参赛者需要提供的内容.md
@@ -1,4 +1,4 @@
 1. Following the format of example_torchcode.py, provide an op implemented in torch, named torchcode.py
 2. Following the style of prompt.txt, use an LLM (DeepSeek, Tongyi Qianwen, GPT, Gemini, or another large model) to generate an initial CUDA operator, organize it into a runnable CUDA op in the format of example_cudacode.py, name it cudacode_ori.py, and check the operator's numerical accuracy with run_code.py
 3. Starting from a cudacode_ori.py that meets the accuracy requirement, optimize the CUDA operator's performance, check accuracy and speedup with run_code.py, and produce the final, best-performing CUDA operator implementation, named cudacode_opt.py, in the format of example_cudacode.py
-4. For each op, participants must provide four files: torchcode.py, prompt.txt, cudacode_ori.py, and example_cudacode.py
\ No newline at end of file