diff --git a/example/ccf_example/example_cudacode.py b/example/ccf_example/example_cudacode.py
new file mode 100644
index 0000000..c637ecc
--- /dev/null
+++ b/example/ccf_example/example_cudacode.py
@@ -0,0 +1,44 @@
+
+import torch
+from torch.utils.cpp_extension import load_inline
+relu_source = """
+#include <torch/extension.h>
+#include <cuda_runtime.h>
+
+__global__ void relu_kernel(const float* x, float* y, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) {
+        y[idx] = fmaxf(x[idx], 0.f);
+    }
+}
+
+torch::Tensor relu_cuda(torch::Tensor x) {
+    auto size = x.numel();
+    auto y = torch::empty_like(x);
+    const int block_size = 256;
+    int num_blocks = (size + block_size - 1) / block_size;
+    relu_kernel<<<num_blocks, block_size>>>(x.data_ptr<float>(), y.data_ptr<float>(), size);
+    return y;
+}
+"""
+
+relu_cpp_source = """
+torch::Tensor relu_cuda(torch::Tensor x);
+"""
+
+# Compile the inline CUDA code
+relu = load_inline(
+    name="relu",
+    cpp_sources=relu_cpp_source,
+    cuda_sources=relu_source,
+    functions=["relu_cuda"],
+    verbose=True
+)
+
+class ModelNew(torch.nn.Module):
+    def __init__(self):
+        super(ModelNew, self).__init__()
+        self.relu = relu  # The module containing the compiled kernel
+
+    def forward(self, x):
+        return self.relu.relu_cuda(x)
\ No newline at end of file
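The `relu_cuda` function above assumes a contiguous float32 CUDA tensor, and nothing in `ModelNew` enforces that: a CPU or float64 input would fail inside the extension or silently corrupt memory. A minimal defensive `forward` might look like the sketch below (hypothetical, not part of example_cudacode.py):

```python
# Hypothetical guarded forward for ModelNew (illustrative sketch, assumes the
# same `relu` extension module as above): fail loudly instead of launching the
# kernel on memory it cannot handle.
def forward(self, x):
    assert x.is_cuda, "relu_cuda expects a CUDA tensor"
    assert x.dtype == torch.float32, "relu_cuda expects float32"
    # The kernel indexes memory linearly, so make the layout contiguous first.
    return self.relu.relu_cuda(x.contiguous())
```

Checking in Python keeps the CUDA source unchanged and turns a bad input into a readable error instead of an opaque launch failure.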
diff --git a/example/ccf_example/example_torchcode.py b/example/ccf_example/example_torchcode.py
new file mode 100644
index 0000000..eb7aac8
--- /dev/null
+++ b/example/ccf_example/example_torchcode.py
@@ -0,0 +1,31 @@
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    """
+    Simple model that performs a ReLU activation.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies ReLU activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with ReLU applied, same shape as input.
+        """
+        return torch.relu(x)
+
+batch_size = 16
+dim = 16384
+
+def get_inputs():
+    x = torch.randn(batch_size, dim)
+    return [x]
+
+def get_init_inputs():
+    return []  # No special initialization inputs needed
\ No newline at end of file
diff --git a/example/ccf_example/prompt.txt b/example/ccf_example/prompt.txt
new file mode 100644
index 0000000..2fecbf3
--- /dev/null
+++ b/example/ccf_example/prompt.txt
@@ -0,0 +1,118 @@
+You write custom CUDA kernels to replace the PyTorch operators in the given architecture to get speedups.
+
+You have complete freedom to choose the set of operators you want to replace. You may decide to replace some operators with custom CUDA kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.
+
+Here's an example to show you the syntax of inline embedding custom CUDA operators in torch. The example given architecture is:
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Model(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, a, b):
+        return a + b
+
+
+def get_inputs():
+    # randomly generate input tensors based on the model architecture
+    a = torch.randn(1, 128).cuda()
+    b = torch.randn(1, 128).cuda()
+    return [a, b]
+
+
+def get_init_inputs():
+    # randomly generate tensors required for initialization based on the model architecture
+    return []
+```
+
+The example new arch with custom CUDA kernels looks like this:
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.cpp_extension import load_inline
+
+# Define the custom CUDA kernel for element-wise addition
+elementwise_add_source = """
+#include <torch/extension.h>
+#include <cuda_runtime.h>
+
+__global__ void elementwise_add_kernel(const float* a, const float* b, float* out, int size) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < size) {
+        out[idx] = a[idx] + b[idx];
+    }
+}
+
+torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b) {
+    auto size = a.numel();
+    auto out = torch::empty_like(a);
+    const int block_size = 256;
+    int num_blocks = (size + block_size - 1) / block_size;
+    elementwise_add_kernel<<<num_blocks, block_size>>>(a.data_ptr<float>(), b.data_ptr<float>(), out.data_ptr<float>(), size);
+    return out;
+}
+"""
+
+elementwise_add_cpp_source = """
+torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b);
+"""
+
+# Compile the inline CUDA code for element-wise addition
+elementwise_add = load_inline(
+    name="elementwise_add",
+    cpp_sources=elementwise_add_cpp_source,
+    cuda_sources=elementwise_add_source,
+    functions=["elementwise_add_cuda"],
+    verbose=True
+)
+
+
+class ModelNew(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.elementwise_add = elementwise_add
+
+    def forward(self, a, b):
+        return self.elementwise_add.elementwise_add_cuda(a, b)
+```
+
+You are given the following architecture:
+
+```python
+import torch
+import torch.nn as nn
+
+class Model(nn.Module):
+    """
+    Simple model that performs a ReLU activation.
+    """
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Applies ReLU activation to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor of any shape.
+
+        Returns:
+            torch.Tensor: Output tensor with ReLU applied, same shape as input.
+        """
+        return torch.relu(x)
+
+batch_size = 16
+dim = 16384
+
+def get_inputs():
+    x = torch.randn(batch_size, dim)
+    return [x]
+
+def get_init_inputs():
+    return []  # No special initialization inputs needed
+```
\ No newline at end of file
diff --git a/example/ccf_example/readme.md b/example/ccf_example/readme.md
new file mode 100644
index 0000000..1447c24
--- /dev/null
+++ b/example/ccf_example/readme.md
@@ -0,0 +1,9 @@
+# File overview
+
+example_torchcode.py: an example op implemented in torch
+
+example_cudacode.py: the CUDA code corresponding to the torch code
+
+prompt.txt: an example prompt for generating CUDA code from torch code with an LLM (the original torch code is appended at the end of the prompt)
+
+run_code.py: example code that checks whether the generated CUDA code matches the original torch output and measures the speedup
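run_code.py below times both models with `time.time()` bracketed by `torch.cuda.synchronize()`, which is correct but folds Python-side launch overhead into every iteration. CUDA events are a common alternative that timestamp on the GPU itself. A minimal sketch, assuming `model` and `inputs` already live on the GPU (the helper name `time_model` is illustrative, not part of run_code.py):

```python
import torch

def time_model(model, inputs, num_iterations=100):
    # Warm up so one-time compilation and caching do not pollute the measurement.
    for _ in range(10):
        _ = model(*inputs)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(num_iterations):
        _ = model(*inputs)
    end.record()
    torch.cuda.synchronize()
    # elapsed_time returns milliseconds; convert to seconds per iteration.
    return start.elapsed_time(end) / num_iterations / 1000.0
```

Usage would mirror the script below, e.g. `torch_time = time_model(torch_model, inputs)` and `cuda_time = time_model(cuda_model, inputs)`.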
diff --git a/example/ccf_example/run_code.py b/example/ccf_example/run_code.py
new file mode 100644
index 0000000..a18a7cd
--- /dev/null
+++ b/example/ccf_example/run_code.py
@@ -0,0 +1,74 @@
+###########################################################
+# Performance and accuracy verification program
+###########################################################
+import torch
+import torch.nn as nn
+import time
+from example_torchcode import Model, get_inputs, get_init_inputs
+from example_cudacode import ModelNew
+
+def run_benchmark():
+    # Check that CUDA is available
+    if not torch.cuda.is_available():
+        print("CUDA is not available. Make sure you have an NVIDIA GPU and a CUDA build of PyTorch installed.")
+        return
+    else:
+        device = torch.device("cuda")
+
+    # Initialize the models
+    init_inputs = get_init_inputs()
+    init_inputs = [
+        x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs
+    ]
+    inputs = get_inputs()
+    inputs = [
+        x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs
+    ]
+
+    torch_model = Model(*init_inputs).cuda()
+    cuda_model = ModelNew(*init_inputs).cuda()
+
+    torch_model.eval()
+    cuda_model.eval()
+
+    print("-------------------- Accuracy check --------------------")
+    with torch.no_grad():
+        output_torch = torch_model(*inputs)
+        output_cuda = cuda_model(*inputs)
+
+    precision_flag = torch.allclose(output_torch, output_cuda, rtol=1e-03)
+    if precision_flag:
+        print("✅ Accuracy check passed: the outputs of the two models are very close.")
+    else:
+        print("❌ Accuracy mismatch!")
+
+    print("\n-------------------- Speedup benchmark --------------------")
+    num_iterations = 100
+
+    # Time the PyTorch model (the accuracy check above also served as a warm-up)
+    torch.cuda.synchronize()
+    start_time = time.time()
+    for _ in range(num_iterations):
+        _ = torch_model(*inputs)
+    torch.cuda.synchronize()
+    torch_time = (time.time() - start_time) / num_iterations
+
+    # Time the custom CUDA kernel
+    torch.cuda.synchronize()
+    start_time = time.time()
+    for _ in range(num_iterations):
+        _ = cuda_model(*inputs)
+    torch.cuda.synchronize()
+    cuda_time = (time.time() - start_time) / num_iterations
+
+    print(f"PyTorch torch.relu average execution time: {torch_time:.6f} s")
+    print(f"Custom CUDA kernel average execution time: {cuda_time:.6f} s")
+    speedup = 0
+    if cuda_time > 0:
+        speedup = torch_time / cuda_time
+        print(f"Speedup: {speedup:.2f}x")
+    else:
+        print("CUDA kernel time is zero; cannot compute a speedup.")
+    return precision_flag, speedup
+if __name__ == "__main__":
+    precision_flag, speedup = run_benchmark()
\ No newline at end of file
diff --git a/example/ccf_example/参赛者需要提供的内容.md b/example/ccf_example/参赛者需要提供的内容.md
new file mode 100644
index 0000000..cb75880
--- /dev/null
+++ b/example/ccf_example/参赛者需要提供的内容.md
@@ -0,0 +1,4 @@
+1. Following the format of example_torchcode.py, provide an op implemented in torch, named torchcode.py.
+2. Following the style of prompt.txt, use an LLM (DeepSeek, Qwen, GPT, Gemini, or another large model) to generate an initial CUDA operator, organize it into a runnable CUDA op in the format of example_cudacode.py, name it cudacode_ori.py, and check the operator's accuracy with run_code.py.
+3. Starting from a cudacode_ori.py that meets the accuracy requirement, optimize the CUDA operator's performance, checking accuracy and speedup with run_code.py, until you reach the best-performing CUDA operator implementation; name it cudacode_opt.py, in the format of example_cudacode.py.
+4. For each op, participants must provide four files: torchcode.py, prompt.txt, cudacode_ori.py, and cudacode_opt.py.
\ No newline at end of file
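As an illustration of what the optimization in step 3 could look like (an assumption, not a required technique): ReLU is memory-bandwidth-bound, so widening each thread's loads and stores from `float` to `float4` reduces the number of threads and instructions per element. The sketch below shows kernel source a cudacode_opt.py might embed; it assumes `numel()` is divisible by 4 (true for the 16 x 16384 input here) and 16-byte alignment, which holds for freshly allocated contiguous tensors:

```python
# Sketch of one possible kernel for cudacode_opt.py (illustrative only):
# process four floats per thread via float4 to improve memory throughput.
relu_opt_source = """
#include <torch/extension.h>
#include <cuda_runtime.h>

__global__ void relu4_kernel(const float4* x, float4* y, int n4) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n4) {
        float4 v = x[idx];
        v.x = fmaxf(v.x, 0.f);
        v.y = fmaxf(v.y, 0.f);
        v.z = fmaxf(v.z, 0.f);
        v.w = fmaxf(v.w, 0.f);
        y[idx] = v;
    }
}

torch::Tensor relu_cuda(torch::Tensor x) {
    auto y = torch::empty_like(x);
    const int n4 = static_cast<int>(x.numel() / 4);  // assumes numel() % 4 == 0
    const int block_size = 256;
    const int num_blocks = (n4 + block_size - 1) / block_size;
    relu4_kernel<<<num_blocks, block_size>>>(
        reinterpret_cast<const float4*>(x.data_ptr<float>()),
        reinterpret_cast<float4*>(y.data_ptr<float>()),
        n4);
    return y;
}
"""
```

The same `load_inline` call and `ModelNew` wrapper as in example_cudacode.py would compile and expose this source unchanged, so run_code.py can verify its accuracy and speedup as-is.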