
run_code.py 2.5 kB

###########################################################
# Performance and accuracy verification harness
###########################################################
import torch
import torch.nn as nn
import time
from example_torchcode import Model, get_inputs, get_init_inputs
from example_cudacode import ModelNew


def run_benchmark():
    # Make sure CUDA is available before doing anything else
    if not torch.cuda.is_available():
        print("CUDA is not available. Make sure you have an NVIDIA GPU and a CUDA build of PyTorch installed.")
        return False, 0.0
    device = torch.device("cuda")

    # Build both models from the same init inputs and move everything to the GPU
    init_inputs = get_init_inputs()
    init_inputs = [
        x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs
    ]
    inputs = get_inputs()
    inputs = [
        x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs
    ]
    torch_model = Model(*init_inputs).cuda()
    cuda_model = ModelNew(*init_inputs).cuda()
    torch_model.eval()
    cuda_model.eval()

    print("-------------------- Accuracy check --------------------")
    with torch.no_grad():
        output_torch = torch_model(*inputs)
        output_cuda = cuda_model(*inputs)
        precision_flag = torch.allclose(output_torch, output_cuda, rtol=1e-03)
    if precision_flag:
        print("✅ Accuracy OK: the two models produce nearly identical outputs.")
    else:
        print("❌ Accuracy mismatch!")

    print("\n-------------------- Speedup benchmark --------------------")
    num_iterations = 100

    # Time the PyTorch reference model
    torch.cuda.synchronize()
    start_time = time.time()
    for _ in range(num_iterations):
        _ = torch_model(*inputs)
    torch.cuda.synchronize()
    torch_time = (time.time() - start_time) / num_iterations

    # Time the custom CUDA kernel
    torch.cuda.synchronize()
    start_time = time.time()
    for _ in range(num_iterations):
        _ = cuda_model(*inputs)
    torch.cuda.synchronize()
    cuda_time = (time.time() - start_time) / num_iterations

    print(f"PyTorch torch.relu average execution time: {torch_time:.6f} s")
    print(f"Custom CUDA kernel average execution time: {cuda_time:.6f} s")

    speedup = 0
    if cuda_time > 0:
        speedup = torch_time / cuda_time
        print(f"Speedup: {speedup:.2f}x")
    else:
        print("CUDA kernel execution time is 0; cannot compute a speedup.")
    return precision_flag, speedup


if __name__ == "__main__":
    precision_flag, speedup = run_benchmark()
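
The harness only runs if example_torchcode.py and example_cudacode.py sit next to it. Those files are not part of this listing, so the following is a minimal sketch of the interface the script assumes: a Model class plus get_init_inputs() and get_inputs() helpers on the PyTorch side, and a ModelNew class with the same constructor and forward signature on the CUDA side. The ReLU workload and the input shape below are placeholders, not the actual example files.

# --- example_torchcode.py (hypothetical sketch) ---
import torch
import torch.nn as nn

class Model(nn.Module):
    # Reference PyTorch implementation used as the baseline.
    def forward(self, x):
        return torch.relu(x)

def get_init_inputs():
    # Arguments forwarded to Model(*init_inputs); empty for this placeholder.
    return []

def get_inputs():
    # Tensors forwarded to the model's forward(); the shape is an assumption.
    return [torch.randn(16, 16384)]

# --- example_cudacode.py (hypothetical sketch) ---
import torch
import torch.nn as nn

class ModelNew(nn.Module):
    # In the real file this would call a hand-written CUDA kernel
    # (for example one compiled with torch.utils.cpp_extension.load_inline);
    # torch.relu stands in here so the harness runs end to end.
    def forward(self, x):
        return torch.relu(x)

With those two files in place, python run_code.py prints the accuracy check followed by the measured speedup.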