You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

run_code.py 3.0 kB

2 months ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
###########################################################
# Performance and accuracy verification program
###########################################################
  4. import torch
  5. import torch.nn as nn
  6. import time
  7. from example_torchcode import Model, get_inputs, get_init_inputs
  8. from example_cudacode import ModelNew
  9. def run_benchmark():
  10. # 检查 CUDA 是否可用
  11. if not torch.cuda.is_available():
  12. print("CUDA 不可用,请确保您有可用的 NVIDIA GPU 并已正确安装 PyTorch CUDA 版本。")
  13. return
  14. else:
  15. device = torch.device("cuda")
  16. # 初始化模型
  17. init_inputs = get_init_inputs()
  18. init_inputs = [
  19. x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs
  20. ]
  21. inputs = get_inputs()
  22. inputs = [
  23. x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs
  24. ]
  25. torch_model = Model(*init_inputs).cuda()
  26. cuda_model = ModelNew(*init_inputs).cuda()
  27. torch_model.eval()
  28. cuda_model.eval()
  29. print("-------------------- 精度对齐验证 --------------------")
  30. with torch.no_grad():
  31. output_torch = torch_model(*inputs)
  32. output_cuda = cuda_model(*inputs)
  33. # 更严格的精度检查
  34. abs_diff = (output_torch - output_cuda).abs()
  35. max_diff = abs_diff.max().item()
  36. mean_diff = abs_diff.mean().item()
  37. print(f"最大差异: {max_diff:.6f}")
  38. print(f"平均差异: {mean_diff:.6f}")
  39. precision_flag = torch.allclose(output_torch, output_cuda, rtol=1e-05, atol=1e-05)
  40. if precision_flag:
  41. print("✅ 精度对齐:两个模型的输出结果非常接近。")
  42. else:
  43. print("❌ 精度不一致!")
  44. print("\n-------------------- 性能加速比测试 --------------------")
  45. num_iterations = 1000 # 增加迭代次数以获得更准确的时间测量
  46. # Warm up
  47. for _ in range(100):
  48. _ = torch_model(*inputs)
  49. _ = cuda_model(*inputs)
  50. # PyTorch 模型计时
  51. torch.cuda.synchronize()
  52. start_time = time.time()
  53. for _ in range(num_iterations):
  54. _ = torch_model(*inputs)
  55. torch.cuda.synchronize()
  56. torch_time = (time.time() - start_time) / num_iterations
  57. # 自定义 CUDA 内核计时
  58. torch.cuda.synchronize()
  59. start_time = time.time()
  60. for _ in range(num_iterations):
  61. _ = cuda_model(*inputs)
  62. torch.cuda.synchronize()
  63. cuda_time = (time.time() - start_time) / num_iterations
  64. print(f"PyTorch (matmul + relu) 平均执行时间: {torch_time:.6f} 秒")
  65. print(f"自定义 CUDA ReLU 平均执行时间: {cuda_time:.6f} 秒")
  66. speedup = 0
  67. if cuda_time > 0:
  68. speedup = torch_time / cuda_time
  69. print(f"加速比 (Speedup): {speedup:.2f}x")
  70. else:
  71. print("CUDA 内核执行时间为0,无法计算加速比。")
  72. return precision_flag, speedup
  73. if __name__ == "__main__":
  74. precision_flag, speedup = run_benchmark()