###########################################################
# Performance and precision verification script
###########################################################
import time

import torch

from example_torchcode import Model, get_inputs, get_init_inputs
from example_cudacode import ModelNew


def run_benchmark():
    # Check that CUDA is available before doing anything else
    if not torch.cuda.is_available():
        print("CUDA is not available. Make sure an NVIDIA GPU is present and the CUDA build of PyTorch is installed.")
        return False, 0.0
    device = torch.device("cuda")

    # Prepare constructor arguments and inputs, moving any tensors to the GPU
    init_inputs = get_init_inputs()
    init_inputs = [
        x.cuda(device=device) if isinstance(x, torch.Tensor) else x
        for x in init_inputs
    ]
    inputs = get_inputs()
    inputs = [
        x.cuda(device=device) if isinstance(x, torch.Tensor) else x
        for x in inputs
    ]

    torch_model = Model(*init_inputs).cuda()
    cuda_model = ModelNew(*init_inputs).cuda()

    torch_model.eval()
    cuda_model.eval()

    print("-------------------- Precision check --------------------")
    with torch.no_grad():
        output_torch = torch_model(*inputs)
        output_cuda = cuda_model(*inputs)

    # Stricter precision check: report max and mean absolute difference
    abs_diff = (output_torch - output_cuda).abs()
    max_diff = abs_diff.max().item()
    mean_diff = abs_diff.mean().item()
    print(f"Max difference: {max_diff:.6f}")
    print(f"Mean difference: {mean_diff:.6f}")

    precision_flag = torch.allclose(output_torch, output_cuda, rtol=1e-05, atol=1e-05)
    if precision_flag:
        print("✅ Precision check passed: the two models produce nearly identical outputs.")
    else:
        print("❌ Precision mismatch!")

    print("\n-------------------- Speedup benchmark --------------------")
    num_iterations = 1000  # more iterations give a more accurate timing

    with torch.no_grad():
        # Warm up both models so one-off costs (kernel compilation,
        # allocator warm-up) do not pollute the measurement
        for _ in range(100):
            _ = torch_model(*inputs)
            _ = cuda_model(*inputs)

        # Time the reference PyTorch model
        torch.cuda.synchronize()
        start_time = time.time()
        for _ in range(num_iterations):
            _ = torch_model(*inputs)
        torch.cuda.synchronize()
        torch_time = (time.time() - start_time) / num_iterations

        # Time the custom CUDA kernel
        torch.cuda.synchronize()
        start_time = time.time()
        for _ in range(num_iterations):
            _ = cuda_model(*inputs)
        torch.cuda.synchronize()
        cuda_time = (time.time() - start_time) / num_iterations

    print(f"PyTorch (matmul + relu) average time per iteration: {torch_time:.6f} s")
    print(f"Custom CUDA ReLU average time per iteration: {cuda_time:.6f} s")

    speedup = 0.0
    if cuda_time > 0:
        speedup = torch_time / cuda_time
        print(f"Speedup: {speedup:.2f}x")
    else:
        print("CUDA kernel time is zero; cannot compute a speedup.")

    return precision_flag, speedup


if __name__ == "__main__":
    precision_flag, speedup = run_benchmark()
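
# ---------------------------------------------------------------
# Optional: CUDA-event-based timing (a minimal sketch, not used by
# run_benchmark above). torch.cuda.Event records timestamps on the
# GPU stream itself, which avoids the host-side jitter that
# time.time() can pick up around torch.cuda.synchronize(). The
# helper name `time_with_cuda_events` is illustrative and is not
# part of example_torchcode or example_cudacode.
# ---------------------------------------------------------------
def time_with_cuda_events(model, inputs, num_iterations=1000):
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    with torch.no_grad():
        start_event.record()
        for _ in range(num_iterations):
            _ = model(*inputs)
        end_event.record()
    torch.cuda.synchronize()  # wait until both events have completed
    # elapsed_time() returns milliseconds; convert to seconds per call
    return start_event.elapsed_time(end_event) / 1000.0 / num_iterations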