diff --git a/S1/wzz05_#1/mish_cudacode.py b/S1/wzz05_#1/mish_cudacode.py
new file mode 100644
index 0000000000000000000000000000000000000000..56f1cde1bea8e92a4a974dd08fd806bc428549d1
--- /dev/null
+++ b/S1/wzz05_#1/mish_cudacode.py
@@ -0,0 +1,77 @@
+import torch
+import torch.nn as nn
+from torch.utils.cpp_extension import load_inline
+
+# CUDA implementation of the Mish activation function
+mish_source = """
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// Mish: x * tanh(softplus(x)), with a numerically stable softplus
+__device__ __forceinline__ float mish_impl(float x) {
+    float softplus_val = (x > 20.0f) ? x : ((x < -20.0f) ? expf(x) : log1pf(expf(x)));
+    float tanh_val = tanhf(softplus_val);
+    return x * tanh_val;
+}
+
+__global__ void mish_kernel(
+    const float* __restrict__ input,
+    float* __restrict__ output,
+    const int size
+) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= size) return;
+
+    output[idx] = mish_impl(input[idx]);
+}
+
+torch::Tensor mish_cuda(torch::Tensor input) {
+    TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
+    TORCH_CHECK(input.dtype() == torch::kFloat32, "input must be float32");
+
+    input = input.contiguous();
+    const int total_size = input.numel();
+    auto output = torch::empty_like(input);
+
+    const int threads_per_block = 256;
+    const int blocks = (total_size + threads_per_block - 1) / threads_per_block;
+
+    mish_kernel<<<blocks, threads_per_block>>>(
+        input.data_ptr<float>(),
+        output.data_ptr<float>(),
+        total_size
+    );
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        throw std::runtime_error("CUDA error: " + std::string(cudaGetErrorString(err)));
+    }
+
+    return output;
+}
+"""
+
+mish_cpp_source = """
+torch::Tensor mish_cuda(torch::Tensor input);
+"""
+
+mish_module = load_inline(
+    name="mish_final",
+    cpp_sources=mish_cpp_source,
+    cuda_sources=mish_source,
+    functions=["mish_cuda"],
+    extra_cuda_cflags=["-O2"],
+    verbose=False
+)
+
+class ModelNew(nn.Module):
+    def __init__(self, in_features):
+        super(ModelNew, self).__init__()
+        torch.manual_seed(42)
+        self.linear = nn.Linear(in_features, in_features)
+        self.mish = mish_module.mish_cuda
+
+    def forward(self, x):
+        x = self.linear(x)
+        return self.mish(x)
diff --git a/S1/wzz05_#1/mish_torchcode.py b/S1/wzz05_#1/mish_torchcode.py
new file mode 100644
index 0000000000000000000000000000000000000000..769504a11cdfb92d1ee56d7c6f9e8ea2a9d8f8b3
--- /dev/null
+++ b/S1/wzz05_#1/mish_torchcode.py
@@ -0,0 +1,22 @@
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    def __init__(self, in_features):
+        super().__init__()
+        # Fix the random seed to ensure reproducibility
+        torch.manual_seed(42)
+        self.linear = nn.Linear(in_features, in_features)
+
+    def forward(self, x):
+        x = self.linear(x)
+        return x * torch.tanh(torch.nn.functional.softplus(x))
+
+def get_inputs():
+    batch_size = 4096
+    in_features = 1024
+    x = torch.randn(batch_size, in_features)
+    return [x]
+
+def get_init_inputs():
+    return [1024]
diff --git a/S1/wzz05_#1/prompt.txt b/S1/wzz05_#1/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c68a8af94a918116a8b1c84c6c8e34acdf8bcdcc
--- /dev/null
+++ b/S1/wzz05_#1/prompt.txt
@@ -0,0 +1,183 @@
+You write custom CUDA kernels to replace the pytorch operators in the given architecture to get speedups.
+
+You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom CUDA kernels and leave others unchanged.
+You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.
+
+Here's an example to show you the syntax of inline embedding custom CUDA operators in torch. The example given architecture is:
+
+python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, a, b):
+        return a + b
+
+def get_inputs():
+    # randomly generate input tensors based on the model architecture
+    a = torch.randn(1, 128).cuda()
+    b = torch.randn(1, 128).cuda()
+    return [a, b]
+
+def get_init_inputs():
+    # randomly generate tensors required for initialization based on the model architecture
+    return []
+
+
+The example new arch with custom CUDA kernels looks like this:
+
+python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, a, b):
+        return a + b
+
+def get_inputs():
+    # randomly generate input tensors based on the model architecture
+    a = torch.randn(1, 128).cuda()
+    b = torch.randn(1, 128).cuda()
+    return [a, b]
+
+def get_init_inputs():
+    # randomly generate tensors required for initialization based on the model architecture
+    return []
+
+
+You are given the following architecture:
+
+python
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    """
+    Model with Linear layer followed by Mish activation.
+    """
+    def __init__(self, in_features: int):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(in_features, in_features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies Linear transformation followed by Mish activation.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+
+        Returns:
+            torch.Tensor: Output tensor with same shape as input.
+        """
+        x = self.linear(x)
+        return x * torch.tanh(torch.nn.functional.softplus(x))
+
+batch_size = 4096
+in_features = 1024
+
+def get_inputs():
+    x = torch.randn(batch_size, in_features)
+    return [x]
+
+def get_init_inputs():
+    return [1024]  # in_features parameter for initialization
+
+
+example_torchcode.py
+
+
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    """
+    Model with Linear layer followed by Mish activation.
+    """
+    def __init__(self, in_features: int):
+        super(Model, self).__init__()
+        self.linear = nn.Linear(in_features, in_features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies Linear transformation followed by Mish activation.
+
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
+
+        Returns:
+            torch.Tensor: Output tensor with same shape as input.
+        """
+        x = self.linear(x)
+        return x * torch.tanh(torch.nn.functional.softplus(x))
+
+batch_size = 4096
+in_features = 1024
+
+def get_inputs():
+    x = torch.randn(batch_size, in_features)
+    return [x]
+
+def get_init_inputs():
+    return [1024]  # in_features parameter for initialization
+
+
+example_cudacode.py
+
+
+import torch
+from torch.utils.cpp_extension import load_inline
+
+mish_source = """
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+__device__ __forceinline__ float mish_impl(float x) {
+    float softplus_val = (x > 20.0f) ? x : ((x < -20.0f) ? expf(x) : log1pf(expf(x)));
+    float tanh_val = tanhf(softplus_val);
+    return x * tanh_val;
+}
+
+__global__ void mish_kernel(const float* x, float* y, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) {
+        y[idx] = mish_impl(x[idx]);
+    }
+}
+
+torch::Tensor mish_cuda(torch::Tensor x) {
+    auto size = x.numel();
+    auto y = torch::empty_like(x);
+    const int block_size = 256;
+    int num_blocks = (size + block_size - 1) / block_size;
+    mish_kernel<<<num_blocks, block_size>>>(x.data_ptr<float>(), y.data_ptr<float>(), size);
+    return y;
+}
+"""
+
+mish_cpp_source = """
+torch::Tensor mish_cuda(torch::Tensor x);
+"""
+
+# Compile the inline CUDA code
+mish = load_inline(
+    name="mish",
+    cpp_sources=mish_cpp_source,
+    cuda_sources=mish_source,
+    functions=["mish_cuda"],
+    verbose=True
+)
+
+class ModelNew(torch.nn.Module):
+    def __init__(self, in_features: int):
+        super(ModelNew, self).__init__()
+        torch.manual_seed(42)  # Ensure reproducibility
+        self.linear = torch.nn.Linear(in_features, in_features)
+        self.mish = mish  # The module containing the kernel
+
+    def forward(self, x):
+        x = self.linear(x)
+        return self.mish.mish_cuda(x)
\ No newline at end of file
diff --git a/S1/wzz05_#1/run_code.py b/S1/wzz05_#1/run_code.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe35be7cb18f55c1c621879429db78cd6336708f
--- /dev/null
+++ b/S1/wzz05_#1/run_code.py
@@ -0,0 +1,74 @@
+###########################################################
+# Performance and accuracy validation script
+###########################################################
+import torch
+import torch.nn as nn
+import time
+from mish_torchcode import Model, get_inputs, get_init_inputs
+from mish_cudacode import ModelNew
+
+def run_benchmark():
+    # Check that CUDA is available
+    if not torch.cuda.is_available():
+        print("CUDA is not available. Make sure an NVIDIA GPU is present and the CUDA build of PyTorch is installed.")
+        return
+    else:
+        device = torch.device("cuda")
+
+    # Build the models and move inputs to the GPU
+    init_inputs = get_init_inputs()
+    init_inputs = [
+        x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs
+    ]
+    inputs = get_inputs()
+    inputs = [
+        x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs
+    ]
+
+    torch_model = Model(*init_inputs).cuda()
+    cuda_model = ModelNew(*init_inputs).cuda()
+
+    torch_model.eval()
+    cuda_model.eval()
+
+    print("-------------------- Accuracy check --------------------")
+    with torch.no_grad():
+        output_torch = torch_model(*inputs)
+        output_cuda = cuda_model(*inputs)
+
+    precision_flag = torch.allclose(output_torch, output_cuda, rtol=1e-03)
+    if precision_flag:
+        print("✅ Outputs match: the two models produce nearly identical results.")
+    else:
+        print("❌ Outputs do not match!")
+
+    print("\n-------------------- Speedup benchmark --------------------")
+    num_iterations = 100
+
+    # Time the PyTorch reference model
+    torch.cuda.synchronize()
+    start_time = time.time()
+    for _ in range(num_iterations):
+        _ = torch_model(*inputs)
+    torch.cuda.synchronize()
+    torch_time = (time.time() - start_time) / num_iterations
+
+    # Time the model that uses the custom CUDA kernel
+    torch.cuda.synchronize()
+    start_time = time.time()
+    for _ in range(num_iterations):
+        _ = cuda_model(*inputs)
+    torch.cuda.synchronize()
+    cuda_time = (time.time() - start_time) / num_iterations
+
+    print(f"PyTorch (Linear + Mish) average execution time: {torch_time:.6f} s")
+    print(f"Custom CUDA kernel average execution time: {cuda_time:.6f} s")
+    speedup = 0
+    if cuda_time > 0:
+        speedup = torch_time / cuda_time
+        print(f"Speedup: {speedup:.2f}x")
+    else:
+        print("CUDA kernel time is 0; cannot compute a speedup.")
+    return precision_flag, speedup
+if __name__ == "__main__":
+    precision_flag, speedup = run_benchmark()
\ No newline at end of file
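
prompt.txt explicitly invites operator fusion (its example is matmul+relu), while mish_cudacode.py replaces only the activation and leaves nn.Linear untouched. One low-risk way to pick up that suggestion without hand-writing a GEMM is to keep the matrix product on cuBLAS (torch.mm) and fuse only the bias add with the Mish epilogue. The following is an illustrative sketch, not part of the committed files; the names bias_mish_source, bias_mish_kernel and bias_mish_cuda are made up for the example.

import torch
from torch.utils.cpp_extension import load_inline

# Illustrative sketch: fuse the bias add and the Mish epilogue into one
# elementwise kernel; the matrix product itself stays on cuBLAS via torch.mm.
bias_mish_source = """
#include <torch/extension.h>
#include <cuda_runtime.h>

__device__ __forceinline__ float mish_impl(float x) {
    float sp = (x > 20.0f) ? x : ((x < -20.0f) ? expf(x) : log1pf(expf(x)));
    return x * tanhf(sp);
}

// in: (rows, cols) result of x @ W^T without bias; bias: (cols,)
__global__ void bias_mish_kernel(const float* in, const float* bias,
                                 float* out, int rows, int cols) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < rows * cols) {
        float v = in[idx] + bias[idx % cols];
        out[idx] = mish_impl(v);
    }
}

torch::Tensor bias_mish_cuda(torch::Tensor in, torch::Tensor bias) {
    auto out = torch::empty_like(in);
    const int rows = in.size(0);
    const int cols = in.size(1);
    const int threads = 256;
    const int blocks = (rows * cols + threads - 1) / threads;
    bias_mish_kernel<<<blocks, threads>>>(
        in.data_ptr<float>(), bias.data_ptr<float>(), out.data_ptr<float>(), rows, cols);
    return out;
}
"""

bias_mish_cpp_source = "torch::Tensor bias_mish_cuda(torch::Tensor in, torch::Tensor bias);"

bias_mish = load_inline(
    name="bias_mish",
    cpp_sources=bias_mish_cpp_source,
    cuda_sources=bias_mish_source,
    functions=["bias_mish_cuda"],
    verbose=False,
)

# Usage inside a forward pass: matmul on cuBLAS, then one fused epilogue kernel.
# y = bias_mish.bias_mish_cuda(torch.mm(x, linear.weight.t()).contiguous(), linear.bias.contiguous())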
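
run_code.py reports averages from time.time() around loops bracketed by torch.cuda.synchronize(), which is reasonable for steady-state throughput. The same measurement can also be taken with CUDA events, which keeps the timed region on the GPU timeline. A minimal sketch, assuming the cuda_model, inputs and num_iterations already defined inside run_benchmark(); it is not a change to the committed script.

start_evt = torch.cuda.Event(enable_timing=True)
end_evt = torch.cuda.Event(enable_timing=True)

torch.cuda.synchronize()
start_evt.record()
for _ in range(num_iterations):
    _ = cuda_model(*inputs)
end_evt.record()
torch.cuda.synchronize()
# elapsed_time() returns milliseconds; convert to seconds per iteration
cuda_time = start_evt.elapsed_time(end_evt) / 1000.0 / num_iterations

Either way, running a few warm-up iterations before timing keeps one-off costs such as kernel module loading out of the average.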