diff --git a/test001/cudacode_ori.py b/test001/cudacode_ori.py
new file mode 100644
index 0000000000000000000000000000000000000000..6323e5fb35a6ee2799339ecb7b3480c86bab746d
--- /dev/null
+++ b/test001/cudacode_ori.py
@@ -0,0 +1,44 @@
+
+import torch
+from torch.utils.cpp_extension import load_inline
+relu_source = """
+#include <torch/extension.h>
+#include <cuda_runtime.h>
+
+__global__ void relu_kernel(const float* x, float* y, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) {
+        y[idx] = fmaxf(x[idx], 0.f);
+    }
+}
+
+torch::Tensor relu_cuda(torch::Tensor x) {
+    auto size = x.numel();
+    auto y = torch::empty_like(x);
+    const int block_size = 256;
+    int num_blocks = (size + block_size - 1) / block_size;
+    relu_kernel<<<num_blocks, block_size>>>(x.data_ptr<float>(), y.data_ptr<float>(), size);
+    return y;
+}
+"""
+
+relu_cpp_source = """
+torch::Tensor relu_cuda(torch::Tensor x);
+"""
+
+# Compile the inline CUDA code
+relu = load_inline(
+    name="relu",
+    cpp_sources=relu_cpp_source,
+    cuda_sources=relu_source,
+    functions=["relu_cuda"],
+    verbose=True
+)
+
+class ModelNew(torch.nn.Module):
+    def __init__(self):
+        super(ModelNew, self).__init__()
+        self.relu = relu # The module containing the kernel
+
+    def forward(self, x):
+        return self.relu.relu_cuda(x)
\ No newline at end of file
diff --git a/test001/example_cudacode.py b/test001/example_cudacode.py
new file mode 100644
index 0000000000000000000000000000000000000000..c637ecc09a40eff6d76b6740ae7f244fb468b921
--- /dev/null
+++ b/test001/example_cudacode.py
@@ -0,0 +1,44 @@
+
+import torch
+from torch.utils.cpp_extension import load_inline
+relu_source = """
+#include <torch/extension.h>
+#include <cuda_runtime.h>
+
+__global__ void relu_kernel(const float* x, float* y, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) {
+        y[idx] = fmaxf(x[idx], 0.f);
+    }
+}
+
+torch::Tensor relu_cuda(torch::Tensor x) {
+    auto size = x.numel();
+    auto y = torch::empty_like(x);
+    const int block_size = 256;
+    int num_blocks = (size + block_size - 1) / block_size;
+    relu_kernel<<<num_blocks, block_size>>>(x.data_ptr<float>(), y.data_ptr<float>(), size);
+    return y;
+}
+"""
+
+relu_cpp_source = """
+torch::Tensor relu_cuda(torch::Tensor x);
+"""
+
+# Compile the inline CUDA code
+relu = load_inline(
+    name="relu",
+    cpp_sources=relu_cpp_source,
+    cuda_sources=relu_source,
+    functions=["relu_cuda"],
+    verbose=True
+)
+
+class ModelNew(torch.nn.Module):
+    def __init__(self):
+        super(ModelNew, self).__init__()
+        self.relu = relu # The module containing the kernel
+
+    def forward(self, x):
+        return self.relu.relu_cuda(x)
\ No newline at end of file
diff --git a/test001/prompt.txt b/test001/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2fecbf34c1b72fe0136c356bd9154ab2dde865dd
--- /dev/null
+++ b/test001/prompt.txt
@@ -0,0 +1,94 @@
+You write custom CUDA kernels to replace the PyTorch operators in the given architecture to get speedups.
+
+You have complete freedom to choose the set of operators you want to replace. You may decide to replace some operators with custom CUDA kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.
+
+Here's an example to show you the syntax of inline embedding custom CUDA operators in torch: The example given architecture is:
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Model(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, a, b):
+        return a + b
+
+
+def get_inputs():
+    # randomly generate input tensors based on the model architecture
+    a = torch.randn(1, 128).cuda()
+    b = torch.randn(1, 128).cuda()
+    return [a, b]
+
+
+def get_init_inputs():
+    # randomly generate tensors required for initialization based on the model architecture
+    return []
+```
+
+The example new arch with custom CUDA kernels looks like this:
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Model(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, a, b):
+        return a + b
+
+
+def get_inputs():
+    # randomly generate input tensors based on the model architecture
+    a = torch.randn(1, 128).cuda()
+    b = torch.randn(1, 128).cuda()
+    return [a, b]
+
+
+def get_init_inputs():
+    # randomly generate tensors required for initialization based on the model architecture
+    return []
+```
+
+You are given the following architecture:
+
+```python
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    """
+    Simple model that performs a ReLU activation.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies ReLU activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with ReLU applied, same shape as input.
+        """
+        return torch.relu(x)
+
+batch_size = 16
+dim = 16384
+
+def get_inputs():
+    x = torch.randn(batch_size, dim)
+    return [x]
+
+def get_init_inputs():
+    return [] # No special initialization inputs needed
+```
\ No newline at end of file
diff --git a/test001/torchcode.py b/test001/torchcode.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb7aac8ba8bcdf35c9422a7fabb3ca6e8d6df091
--- /dev/null
+++ b/test001/torchcode.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    """
+    Simple model that performs a ReLU activation.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies ReLU activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with ReLU applied, same shape as input.
+        """
+        return torch.relu(x)
+
+batch_size = 16
+dim = 16384
+
+def get_inputs():
+    x = torch.randn(batch_size, dim)
+    return [x]
+
+def get_init_inputs():
+    return [] # No special initialization inputs needed
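
For local verification, one option is a small harness that checks the custom kernel against the eager `torch.relu` baseline and reports rough timings. The sketch below is not part of this diff: the filename `check_relu.py`, running it from inside `test001/` (so that `torchcode` and `cudacode_ori` import directly), and the availability of a CUDA GPU with a working nvcc toolchain are all assumptions. Note that `get_inputs()` builds a CPU tensor while `relu_cuda` reads raw `float32` CUDA memory with no device or dtype checks, so the harness moves the input to the GPU first.

```python
# check_relu.py -- hypothetical harness, not part of this diff.
# Assumes it is run from test001/ on a machine with a CUDA GPU.
import torch

from torchcode import Model, get_inputs  # baseline torch.relu model
from cudacode_ori import ModelNew        # importing triggers load_inline compilation


def time_fn(fn, iters: int = 100) -> float:
    """Average milliseconds per call, measured with CUDA events after a warm-up."""
    for _ in range(10):
        fn()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters


def main() -> None:
    assert torch.cuda.is_available(), "the inline kernel requires a CUDA device"

    # get_inputs() returns a CPU tensor; the raw kernel only reads contiguous
    # float32 CUDA memory, so move it to the GPU before calling ModelNew.
    (x,) = get_inputs()
    x = x.cuda().contiguous()

    baseline = Model().cuda()
    custom = ModelNew().cuda()

    # Correctness: ReLU is exact, so the two outputs should match bit-for-bit.
    torch.testing.assert_close(custom(x), baseline(x), rtol=0, atol=0)

    print(f"torch.relu : {time_fn(lambda: baseline(x)):.4f} ms")
    print(f"relu_cuda  : {time_fn(lambda: custom(x)):.4f} ms")


if __name__ == "__main__":
    main()
```

Since ReLU over a 16 x 16384 float32 tensor is memory-bandwidth bound, the hand-written kernel should land roughly on par with `torch.relu`; the larger speedups the prompt alludes to would come from fusing the activation into neighbouring operators rather than from the elementwise kernel itself.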