diff --git a/example/001-example/reduce_sum_performance.yaml b/example/001-example/reduce_sum_performance.yaml new file mode 100644 index 0000000..efa6879 --- /dev/null +++ b/example/001-example/reduce_sum_performance.yaml @@ -0,0 +1,26 @@ +# ReduceSum算法性能测试结果 +# 生成时间: 2025-08-19 15:42:19 + +algorithm: "ReduceSum" +data_types: + input: "float" + output: "float" +formulas: + throughput: "elements / time(s) / 1e9 (G/s)" +performance_data: + - data_size: 1000000 + time_ms: 0.059418 + throughput_gps: 16.830029 + data_type: "float" + - data_size: 134217728 + time_ms: 0.884915 + throughput_gps: 151.672977 + data_type: "float" + - data_size: 536870912 + time_ms: 2.811878 + throughput_gps: 190.929630 + data_type: "float" + - data_size: 1073741824 + time_ms: 5.376102 + throughput_gps: 199.724956 + data_type: "float" diff --git a/example/001-example/sort_pair_performance.yaml b/example/001-example/sort_pair_performance.yaml new file mode 100644 index 0000000..279fb3e --- /dev/null +++ b/example/001-example/sort_pair_performance.yaml @@ -0,0 +1,46 @@ +# SortPair算法性能测试结果 +# 生成时间: 2025-08-19 15:44:51 + +algorithm: "SortPair" +data_types: + key_type: "float" + value_type: "uint32_t" +formulas: + throughput: "elements / time(s) / 1e9 (G/s)" +performance_data: + - data_size: 1000000 + ascending: + time_ms: 0.645709 + throughput_gps: 1.548686 + descending: + time_ms: 0.643405 + throughput_gps: 1.554231 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + ascending: + time_ms: 48.160717 + throughput_gps: 2.786871 + descending: + time_ms: 48.198193 + throughput_gps: 2.784705 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + ascending: + time_ms: 192.608109 + throughput_gps: 2.787374 + descending: + time_ms: 192.794830 + throughput_gps: 2.784675 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + ascending: + time_ms: 391.377167 + throughput_gps: 2.743496 + descending: + time_ms: 391.900879 + throughput_gps: 2.739830 + key_type: "float" + value_type: "uint32_t" diff --git a/example/001-example/topk_pair_performance.yaml b/example/001-example/topk_pair_performance.yaml new file mode 100644 index 0000000..c9540eb --- /dev/null +++ b/example/001-example/topk_pair_performance.yaml @@ -0,0 +1,210 @@ +# TopkPair算法性能测试结果 +# 生成时间: 2025-08-19 15:48:37 + +algorithm: "TopkPair" +data_types: + key_type: "float" + value_type: "uint32_t" +formulas: + throughput: "elements / time(s) / 1e9 (G/s)" +performance_data: + - data_size: 1000000 + k_value: 32 + ascending: + time_ms: 0.678861 + throughput_gps: 1.473056 + descending: + time_ms: 0.686054 + throughput_gps: 1.457610 + key_type: "float" + value_type: "uint32_t" + - data_size: 1000000 + k_value: 50 + ascending: + time_ms: 0.707123 + throughput_gps: 1.414181 + descending: + time_ms: 0.679578 + throughput_gps: 1.471502 + key_type: "float" + value_type: "uint32_t" + - data_size: 1000000 + k_value: 100 + ascending: + time_ms: 0.700877 + throughput_gps: 1.426784 + descending: + time_ms: 0.671693 + throughput_gps: 1.488776 + key_type: "float" + value_type: "uint32_t" + - data_size: 1000000 + k_value: 256 + ascending: + time_ms: 0.706432 + throughput_gps: 1.415564 + descending: + time_ms: 0.689306 + throughput_gps: 1.450735 + key_type: "float" + value_type: "uint32_t" + - data_size: 1000000 + k_value: 1024 + ascending: + time_ms: 0.663194 + throughput_gps: 1.507855 + descending: + time_ms: 0.660531 + throughput_gps: 1.513933 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + k_value: 32 + 
ascending: + time_ms: 48.147915 + throughput_gps: 2.787612 + descending: + time_ms: 48.289310 + throughput_gps: 2.779450 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + k_value: 50 + ascending: + time_ms: 48.196999 + throughput_gps: 2.784774 + descending: + time_ms: 48.226074 + throughput_gps: 2.783095 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + k_value: 100 + ascending: + time_ms: 48.210712 + throughput_gps: 2.783981 + descending: + time_ms: 48.247169 + throughput_gps: 2.781878 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + k_value: 256 + ascending: + time_ms: 48.231018 + throughput_gps: 2.782809 + descending: + time_ms: 48.235191 + throughput_gps: 2.782569 + key_type: "float" + value_type: "uint32_t" + - data_size: 134217728 + k_value: 1024 + ascending: + time_ms: 48.243660 + throughput_gps: 2.782080 + descending: + time_ms: 48.287209 + throughput_gps: 2.779571 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 32 + ascending: + time_ms: 192.694534 + throughput_gps: 2.786124 + descending: + time_ms: 192.709503 + throughput_gps: 2.785908 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 50 + ascending: + time_ms: 192.567810 + throughput_gps: 2.787958 + descending: + time_ms: 192.672104 + throughput_gps: 2.786449 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 100 + ascending: + time_ms: 192.633591 + throughput_gps: 2.787006 + descending: + time_ms: 192.675278 + throughput_gps: 2.786403 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 256 + ascending: + time_ms: 192.612732 + throughput_gps: 2.787307 + descending: + time_ms: 192.753647 + throughput_gps: 2.785270 + key_type: "float" + value_type: "uint32_t" + - data_size: 536870912 + k_value: 1024 + ascending: + time_ms: 192.627899 + throughput_gps: 2.787088 + descending: + time_ms: 192.446716 + throughput_gps: 2.789712 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 32 + ascending: + time_ms: 392.105988 + throughput_gps: 2.738397 + descending: + time_ms: 392.092041 + throughput_gps: 2.738494 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 50 + ascending: + time_ms: 391.682678 + throughput_gps: 2.741356 + descending: + time_ms: 392.191681 + throughput_gps: 2.737799 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 100 + ascending: + time_ms: 391.761108 + throughput_gps: 2.740808 + descending: + time_ms: 392.304626 + throughput_gps: 2.737010 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 256 + ascending: + time_ms: 391.693726 + throughput_gps: 2.741279 + descending: + time_ms: 392.074707 + throughput_gps: 2.738615 + key_type: "float" + value_type: "uint32_t" + - data_size: 1073741824 + k_value: 1024 + ascending: + time_ms: 391.690765 + throughput_gps: 2.741300 + descending: + time_ms: 392.122711 + throughput_gps: 2.738280 + key_type: "float" + value_type: "uint32_t" diff --git a/example/a guide to GPUKernelContest/README.md b/example/a guide to GPUKernelContest/README.md new file mode 100644 index 0000000..3385ae0 --- /dev/null +++ b/example/a guide to GPUKernelContest/README.md @@ -0,0 +1,168 @@ +# GPU 高性能并行计算算法优化竞赛 + +## 🎯 竞赛概述 + +本竞赛旨在评估参赛者在GPU并行计算领域的算法优化能力。参赛者可选择实现三个核心算法的高性能版本: +- **ReduceSum**: 高精度归约求和 +- **SortPair**: 键值对稳定排序 +- **TopkPair**: 键值对TopK选择 + +## 🚀 快速开始 + +### 编译和测试 + +#### 1. 
全量编译和运行
+```bash
+# 编译并运行所有算法测试(默认行为)
+./build_and_run.sh
+
+# 仅编译所有算法,不运行测试
+./build_and_run.sh --build-only
+
+# 编译并运行单个算法测试
+./build_and_run.sh --run_reduce   # ReduceSum算法
+./build_and_run.sh --run_sort     # SortPair算法
+./build_and_run.sh --run_topk     # TopkPair算法
+```
+
+#### 2. 单独编译和运行
+```bash
+# 编译并运行ReduceSum算法(默认行为)
+./build_and_run_reduce_sum.sh
+
+# 仅编译ReduceSum算法,不运行测试
+./build_and_run_reduce_sum.sh --build-only
+
+# 编译并运行SortPair正确性测试
+./build_and_run_sort_pair.sh --run correctness
+
+# 编译并运行TopkPair性能测试
+./build_and_run_topk_pair.sh --run performance
+```
+
+#### 3. 手动运行测试
+```bash
+./build/test_reducesum [correctness|performance|all]
+./build/test_sortpair [correctness|performance|all]
+./build/test_topkpair [correctness|performance|all]
+```
+
+## 📝 参赛指南
+
+### 实现位置
+参赛者需要在以下文件中替换Thrust实现:
+- `src/reduce_sum_algorithm.maca` - 替换Thrust归约求和
+- `src/sort_pair_algorithm.maca` - 替换Thrust稳定排序
+- `src/topk_pair_algorithm.maca` - 替换Thrust TopK选择
+
+### 算法要求
+见 `competition_parallel_algorithms.md`
+
+## 📊 性能评测
+
+### 测试流程
+1. **Warmup**: 5次预热运行
+2. **Benchmark**: 10次正式测试取平均
+3. **数据规模**: 1M, 128M, 512M, 1G elements
+4. **评估指标**: 吞吐量(G/s)
+
+### 性能指标计算
+
+#### ReduceSum
+- **数据类型**: float → float
+- **吞吐量**: elements / time(s) / 1e9 (G/s)
+
+#### SortPair
+- **数据类型**: key: float, value: uint32_t
+- **吞吐量**: elements / time(s) / 1e9 (G/s)
+
+#### TopkPair
+- **数据类型**: key: float, value: uint32_t
+- **吞吐量**: elements / time(s) / 1e9 (G/s)
+
+### 性能结果文件
+每个算法会生成详细的YAML性能分析文件:
+- `reduce_sum_performance.yaml` - ReduceSum性能数据
+- `sort_pair_performance.yaml` - SortPair性能数据
+- `topk_pair_performance.yaml` - TopkPair性能数据
+
+这些文件包含:
+- 算法信息和数据类型
+- 计算公式说明
+- 各数据规模的详细性能数据
+- 升序/降序分别统计(适用时)
+
+## 📁 项目结构
+
+```
+├── build_and_run.sh                    # 统一编译和运行脚本(默认编译+运行所有算法)
+├── build_common.sh                     # 公共编译配置和函数
+├── build_and_run_reduce_sum.sh         # ReduceSum独立编译和运行脚本
+├── build_and_run_sort_pair.sh          # SortPair独立编译和运行脚本
+├── build_and_run_topk_pair.sh          # TopkPair独立编译和运行脚本
+├── competition_parallel_algorithms.md  # 详细题目说明
+├── src/                                # 算法实现和工具文件
+│   ├── reduce_sum_algorithm.maca       # 1. ReduceSum测试程序
+│   ├── sort_pair_algorithm.maca        # 2. SortPair测试程序
+│   ├── topk_pair_algorithm.maca        # 3. TopkPair测试程序
+│   ├── test_utils.h                    # 测试工具和CPU参考实现
+│   ├── yaml_reporter.h                 # YAML性能报告生成器
+│   └── performance_utils.h             # 性能测试工具
+├── final_results/reduce_sum_results.yaml  # ReduceSum性能数据
+├── final_results/sort_pair_results.yaml   # SortPair性能数据
+└── final_results/topk_pair_results.yaml   # TopkPair性能数据
+```
+
+## 🔧 开发工具
+
+### 编译选项
+```bash
+# 默认编译命令
+mxcc -O3 -std=c++17 --extended-lambda -Isrc
+```
+
+### 自动化测试
+```bash
+# 查看所有选项
+./build_and_run.sh --help
+
+# 编译并运行所有测试并生成YAML报告(默认行为)
+./build_and_run.sh
+```
+
+### 环境变量配置
+
+| 变量 | 默认值 | 说明 |
+|--------|--------|------|
+| `COMPILER` | `mxcc` | CUDA编译器路径 |
+| `COMPILER_FLAGS` | `-O3 -std=c++17 --extended-lambda` | 编译标志 |
+| `INCLUDE_DIR` | `src` | 头文件目录 |
+| `BUILD_DIR` | `build` | 构建输出目录 |
+
+### 调试模式
+
+可通过环境变量覆盖 `COMPILER_FLAGS`,例如去掉 `-DRUN_FULL_TEST` 以只测试 1M/128M 规模,便于调试(参见 `build_common.sh` 和 `src/test_utils.h`)。
+
+## 📋 提交清单
+
+在提交前请确保:
+- [ ] 所有算法通过正确性测试
+- [ ] 性能测试可以正常运行
+- [ ] 代码注释清晰,说明优化策略
+- [ ] 无内存泄漏或运行时错误
+- [ ] 生成完整测试报告
+- [ ] 在函数实现注释中说明创新点
+
+```bash
+# 提交时包含以下文件
+# - final_results/reduce_sum_results.yaml
+# - final_results/sort_pair_results.yaml
+# - final_results/topk_pair_results.yaml
+```
+
+## 🤝 技术支持
+
+如有技术问题,请:
+1. 查看详细错误信息和GPU状态
+2. 确认环境配置正确
+3. 检查内存使用是否超限
+4. 
验证算法逻辑和数据类型 + +--- + +**祝您在竞赛中取得优异成绩!** 🏆 diff --git a/example/a guide to GPUKernelContest/build_and_run.sh b/example/a guide to GPUKernelContest/build_and_run.sh new file mode 100644 index 0000000..6dfeed5 --- /dev/null +++ b/example/a guide to GPUKernelContest/build_and_run.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# GPU高性能并行计算算法优化竞赛 - 统一编译脚本 +# 调用各个独立的build_and_run脚本来编译所有算法 + +# 加载公共配置 +source "$(dirname "$0")/build_common.sh" + +# 显示帮助信息 +show_help() { + echo "GPU算法竞赛统一编译和运行脚本" + echo "用法: $0 [选项]" + echo "" + echo "选项:" + echo " --help 显示帮助信息" + echo " --build-only 仅编译所有算法,不运行测试" + echo " --run_reduce 编译并运行ReduceSum算法测试" + echo " --run_sort 编译并运行SortPair算法测试" + echo " --run_topk 编译并运行TopkPair算法测试" + echo "" + echo "示例:" + echo " $0 # 编译并运行所有测试(默认行为)" + echo " $0 --build-only # 仅编译所有算法" + echo " $0 --run_sort # 编译并运行SortPair测试" + echo "" + echo "单独算法脚本:" + echo " ./build_and_run_reduce_sum.sh # 仅处理ReduceSum" + echo " ./build_and_run_sort_pair.sh # 仅处理SortPair" + echo " ./build_and_run_topk_pair.sh # 仅处理TopkPair" + echo "" +} + +# 解析命令行参数 +RUN_MODE="run_all" # 默认为编译并运行所有测试 + +while [[ $# -gt 0 ]]; do + case $1 in + --help) + show_help + exit 0 + ;; + --build-only) + RUN_MODE="build_only" + shift + ;; + --run_reduce) + RUN_MODE="run_reduce" + shift + ;; + --run_sort) + RUN_MODE="run_sort" + shift + ;; + --run_topk) + RUN_MODE="run_topk" + shift + ;; + *) + print_error "未知选项: $1" + show_help + exit 1 + ;; + esac +done + +if [ "$RUN_MODE" = "build_only" ]; then + print_info "开始编译所有算法..." +else + print_info "开始编译并运行所有算法..." +fi +print_info "工作目录: $(pwd)" +print_info "编译时间: $(date '+%Y-%m-%d %H:%M:%S')" +show_build_config + +# 清理构建目录 +if [ -d "$BUILD_DIR" ]; then + print_info "清理现有构建目录: $BUILD_DIR" + rm -rf "$BUILD_DIR" +fi + +# 根据运行模式执行相应操作 +case "$RUN_MODE" in + "build_only") + print_info "编译所有算法..." + + print_info "[1/3] 编译ReduceSum..." + ./build_and_run_reduce_sum.sh --build-only + if [ $? -ne 0 ]; then + print_error "ReduceSum编译失败" + exit 1 + fi + + print_info "[2/3] 编译SortPair..." + ./build_and_run_sort_pair.sh --build-only + if [ $? -ne 0 ]; then + print_error "SortPair编译失败" + exit 1 + fi + + print_info "[3/3] 编译TopkPair..." + ./build_and_run_topk_pair.sh --build-only + if [ $? -ne 0 ]; then + print_error "TopkPair编译失败" + exit 1 + fi + + print_success "所有算法编译完成!" + echo "" + echo "可执行文件:" + echo " $BUILD_DIR/test_reducesum - ReduceSum算法测试" + echo " $BUILD_DIR/test_sortpair - SortPair算法测试" + echo " $BUILD_DIR/test_topkpair - TopkPair算法测试" + echo "" + echo "使用方法:" + echo " ./$BUILD_DIR/test_reducesum [correctness|performance|all]" + echo " ./$BUILD_DIR/test_sortpair [correctness|performance|all]" + echo " ./$BUILD_DIR/test_topkpair [correctness|performance|all]" + ;; + + "run_all") + print_info "编译并运行所有算法测试..." + + print_info "[1/3] ReduceSum..." + ./build_and_run_reduce_sum.sh --run all + + print_info "[2/3] SortPair..." + ./build_and_run_sort_pair.sh --run all + + print_info "[3/3] TopkPair..." + ./build_and_run_topk_pair.sh --run all + + print_success "所有测试完成!" + ;; + + "run_reduce") + print_info "编译并运行ReduceSum测试..." + ./build_and_run_reduce_sum.sh --run all + ;; + + "run_sort") + print_info "编译并运行SortPair测试..." + ./build_and_run_sort_pair.sh --run all + ;; + + "run_topk") + print_info "编译并运行TopkPair测试..." 
+ ./build_and_run_topk_pair.sh --run all + ;; +esac \ No newline at end of file diff --git a/example/a guide to GPUKernelContest/build_and_run_reduce_sum.sh b/example/a guide to GPUKernelContest/build_and_run_reduce_sum.sh new file mode 100644 index 0000000..5e6df35 --- /dev/null +++ b/example/a guide to GPUKernelContest/build_and_run_reduce_sum.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# ReduceSum算法编译和运行脚本 +# 用于参赛者只选择ReduceSum题目时使用 + +# 加载公共配置 +source "$(dirname "$0")/build_common.sh" + +# 显示帮助信息 +show_help() { + echo "ReduceSum算法编译和运行脚本" + echo "用法: $0 [选项]" + echo "" + echo "选项:" + echo " --help 显示此帮助信息" + echo " --build-only 仅编译,不运行测试" + echo " --run MODE 编译并运行测试 (MODE: correctness|performance|all)" + echo "" + echo "示例:" + echo " $0 # 编译并运行所有测试(默认行为)" + echo " $0 --build-only # 仅编译" + echo " $0 --run correctness # 编译并运行正确性测试" + echo "" +} + +# 解析命令行参数 +RUN_MODE="all" # 默认运行所有测试 + +while [[ $# -gt 0 ]]; do + case $1 in + --help) + show_help + exit 0 + ;; + --build-only) + RUN_MODE="" + shift + ;; + --run) + RUN_MODE="$2" + shift 2 + ;; + *) + print_error "未知选项: $1" + show_help + exit 1 + ;; + esac +done + +# 编译ReduceSum算法 +if compile_algorithm "ReduceSum" "reduce_sum_algorithm.maca"; then + # 如果指定了运行模式,则运行测试 + if [ -n "$RUN_MODE" ]; then + echo "" + run_single_test "ReduceSum" "$RUN_MODE" + fi +else + exit 1 +fi \ No newline at end of file diff --git a/example/a guide to GPUKernelContest/build_and_run_sort_pair.sh b/example/a guide to GPUKernelContest/build_and_run_sort_pair.sh new file mode 100644 index 0000000..c28d05a --- /dev/null +++ b/example/a guide to GPUKernelContest/build_and_run_sort_pair.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# SortPair算法编译和运行脚本 +# 用于参赛者只选择SortPair题目时使用 + +# 加载公共配置 +source "$(dirname "$0")/build_common.sh" + +# 显示帮助信息 +show_help() { + echo "SortPair算法编译和运行脚本" + echo "用法: $0 [选项]" + echo "" + echo "选项:" + echo " --help 显示此帮助信息" + echo " --build-only 仅编译,不运行测试" + echo " --run MODE 编译并运行测试 (MODE: correctness|performance|all)" + echo "" + echo "示例:" + echo " $0 # 编译并运行所有测试(默认行为)" + echo " $0 --build-only # 仅编译" + echo " $0 --run correctness # 编译并运行正确性测试" + echo "" +} + +# 解析命令行参数 +RUN_MODE="all" # 默认运行所有测试 + +while [[ $# -gt 0 ]]; do + case $1 in + --help) + show_help + exit 0 + ;; + --build-only) + RUN_MODE="" + shift + ;; + --run) + RUN_MODE="$2" + shift 2 + ;; + *) + print_error "未知选项: $1" + show_help + exit 1 + ;; + esac +done + +# 编译SortPair算法 +if compile_algorithm "SortPair" "sort_pair_algorithm.maca"; then + # 如果指定了运行模式,则运行测试 + if [ -n "$RUN_MODE" ]; then + echo "" + run_single_test "SortPair" "$RUN_MODE" + fi +else + exit 1 +fi \ No newline at end of file diff --git a/example/a guide to GPUKernelContest/build_and_run_topk_pair.sh b/example/a guide to GPUKernelContest/build_and_run_topk_pair.sh new file mode 100644 index 0000000..242ba45 --- /dev/null +++ b/example/a guide to GPUKernelContest/build_and_run_topk_pair.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# TopkPair算法编译和运行脚本 +# 用于参赛者只选择TopkPair题目时使用 + +# 加载公共配置 +source "$(dirname "$0")/build_common.sh" + +# 显示帮助信息 +show_help() { + echo "TopkPair算法编译和运行脚本" + echo "用法: $0 [选项]" + echo "" + echo "选项:" + echo " --help 显示此帮助信息" + echo " --build-only 仅编译,不运行测试" + echo " --run MODE 编译并运行测试 (MODE: correctness|performance|all)" + echo "" + echo "示例:" + echo " $0 # 编译并运行所有测试(默认行为)" + echo " $0 --build-only # 仅编译" + echo " $0 --run correctness # 编译并运行正确性测试" + echo "" +} + +# 解析命令行参数 +RUN_MODE="all" # 默认运行所有测试 + +while [[ $# -gt 0 ]]; do + case $1 in + --help) + show_help + exit 0 + ;; + --build-only) + RUN_MODE="" + shift + ;; 
+ --run) + RUN_MODE="$2" + shift 2 + ;; + *) + print_error "未知选项: $1" + show_help + exit 1 + ;; + esac +done + +# 编译TopkPair算法 +if compile_algorithm "TopkPair" "topk_pair_algorithm.maca"; then + # 如果指定了运行模式,则运行测试 + if [ -n "$RUN_MODE" ]; then + echo "" + run_single_test "TopkPair" "$RUN_MODE" + fi +else + exit 1 +fi \ No newline at end of file diff --git a/example/a guide to GPUKernelContest/build_common.sh b/example/a guide to GPUKernelContest/build_common.sh new file mode 100644 index 0000000..1564496 --- /dev/null +++ b/example/a guide to GPUKernelContest/build_common.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +# GPU算法竞赛公共编译配置 +# 被各个独立编译脚本调用 + +# 设置颜色 +RED='\033[0;31m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[0;33m' +NC='\033[0m' # No Color + +# 打印函数 +print_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +# 编译配置 - 可通过环境变量自定义 +COMPILER=${COMPILER:-mxcc} + +#COMPILER_FLAGS=${COMPILER_FLAGS:--O3 -std=c++17 --extended-lambda} # not run all test for easy debug +COMPILER_FLAGS=${COMPILER_FLAGS:--O3 -std=c++17 --extended-lambda -DRUN_FULL_TEST} + +INCLUDE_DIR=${INCLUDE_DIR:-src} +BUILD_DIR=${BUILD_DIR:-build} + +# 编译单个算法的通用函数 +# 参数: $1=算法名称, $2=源文件名 +compile_algorithm() { + local algo_name="$1" + local source_file="$2" + local target_file="$BUILD_DIR/test_${algo_name,,}" # 转换为小写 + + print_info "编译 $algo_name 算法..." + + # 创建构建目录 + mkdir -p "$BUILD_DIR" + + # 编译命令 + local compile_cmd="$COMPILER $COMPILER_FLAGS -I$INCLUDE_DIR src/$source_file -o $target_file" + + print_info "执行: $compile_cmd" + + if $compile_cmd; then + print_success "$algo_name 编译完成!" + echo "" + echo "运行测试:" + echo " ./$target_file [correctness|performance|all]" + return 0 + else + print_error "$algo_name 编译失败!" + return 1 + fi +} + +# 显示编译配置信息 +show_build_config() { + print_info "编译配置:" + echo " COMPILER: $COMPILER" + echo " COMPILER_FLAGS: $COMPILER_FLAGS" + echo " INCLUDE_DIR: $INCLUDE_DIR" + echo " BUILD_DIR: $BUILD_DIR" + echo "" +} + +# 运行单个测试 +run_single_test() { + local algo_name="$1" + local test_mode="${2:-all}" + local test_file="$BUILD_DIR/test_${algo_name,,}" + + if [ -f "$test_file" ]; then + print_info "运行 $algo_name 测试 (模式: $test_mode)..." + "./$test_file" "$test_mode" + return $? + else + print_error "$algo_name 测试程序不存在: $test_file" + return 1 + fi +} diff --git a/example/a guide to GPUKernelContest/competition_parallel_algorithms.md b/example/a guide to GPUKernelContest/competition_parallel_algorithms.md new file mode 100644 index 0000000..6cf1efd --- /dev/null +++ b/example/a guide to GPUKernelContest/competition_parallel_algorithms.md @@ -0,0 +1,103 @@ +# 题目: +## GPU高性能并行计算算法优化 + +要求参赛者通过一个或多个global kernel 函数(允许配套 device 辅助函数),实现高性能算法。 + +在正确性、稳定性前提下,比拼算法性能。 + + +# 1. ReduceSum算法优化 +```cpp +template +class ReduceSumAlgorithm { +public: + // 主要接口函数 - 参赛者需要实现这个函数 + void reduce(const InputT* d_in, OutputT* d_out, int num_items, OutputT init_value) { + // TODO + } +}; +``` +其中 + +* 数据类型:InputT: float, OutputT: float +* 系统将测试评估1M, 128M, 512M, 1G element number下的算法性能 +* 假定输入d\_in数据量为num\_items + + +注意事项 + +* 累计误差不大于cpu double golden基准的0.5% +* 注意针对NAN和INF等异常值的处理 + + + +加分项 + +* 使用tensor core计算reduce +* 覆盖更全面的数据范围,提供良好稳定的性能表现 + + +# 2. 
Sort Pair算法优化 +```cpp +template +class SortPairAlgorithm { +public: + // 主要接口函数 - 参赛者需要实现这个函数 + void sort(const KeyType* d_keys_in, KeyType* d_keys_out, + const ValueType* d_values_in, ValueType* d_values_out, + int num_items, bool descending) { + // TODO + } +}; +``` +其中 + +* 数据类型:key: float, value: int32\_t +* 系统将测试评估1M, 128M, 512M, 1G element number下的算法性能 +* 假定输入、输出的key和value的数据量一致,均为num\_items + + +注意事项 + +* 需要校验结果正确性 +* 结果必须稳定排序 + + +加分项 + +* 支持其他不同数据类型的排序,如half、double、int32_t等 +* 覆盖更全面的数据范围,提供良好稳定的性能表现 + + + +# 3. Topk Pair算法优化 +```cpp +template +class TopkPairAlgorithm { +public: + // 主要接口函数 - 参赛者需要实现这个函数 + void topk(const KeyType* d_keys_in, KeyType* d_keys_out, + const ValueType* d_values_in, ValueType* d_values_out, + int num_items, int k, bool descending) { + // TODO + } +}; +``` +其中 + +* 数据类型:key: float, value: int32\_t +* 系统将测试评估1M, 128M, 512M, 1G element number下的算法性能 +* 假定输入的key和value的数据量一致,为num\_items;输出的key和value的数据量一致,为k +* k的范围:32,50,100,256,1024。k不大于num\_items + + +注意事项 + +* 结果必须稳定排序 + + +加分项 + +* 支持其他不同数据类型的键值对,实现类型通用算法 +* 覆盖更全面的数据范围,提供良好稳定的性能表现 + diff --git a/example/a guide to GPUKernelContest/src/performance_utils.h b/example/a guide to GPUKernelContest/src/performance_utils.h new file mode 100644 index 0000000..0fcefe2 --- /dev/null +++ b/example/a guide to GPUKernelContest/src/performance_utils.h @@ -0,0 +1,114 @@ +#pragma once +#include +#include +#include + +// ============================================================================ +// 性能计算和显示工具 +// ============================================================================ + +class PerformanceCalculator { +public: + // ReduceSum性能计算 + struct ReduceSumMetrics { + double throughput_gps; // G elements/s + }; + + static ReduceSumMetrics calculateReduceSum(int size, float time_ms) { + ReduceSumMetrics metrics; + metrics.throughput_gps = (size / 1e9) / (time_ms / 1000.0); + return metrics; + } + + // SortPair性能计算 + struct SortPairMetrics { + double throughput_gps; // G elements/s + }; + + static SortPairMetrics calculateSortPair(int size, float time_ms) { + SortPairMetrics metrics; + metrics.throughput_gps = (size / 1e9) / (time_ms / 1000.0); + return metrics; + } + + // TopkPair性能计算 + struct TopkPairMetrics { + double throughput_gps; // G elements/s + }; + + static TopkPairMetrics calculateTopkPair(int size, int k, float time_ms) { + TopkPairMetrics metrics; + metrics.throughput_gps = (size / 1e9) / (time_ms / 1000.0); + return metrics; + } +}; + +// ============================================================================ +// 性能显示工具 +// ============================================================================ + +class PerformanceDisplay { +public: + // 显示ReduceSum性能表头 + static void printReduceSumHeader() { + std::cout << "\nReduceSum 性能测试..." << std::endl; + std::cout << "数据类型: float -> float" << std::endl; + std::cout << "计算公式:" << std::endl; + std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; + std::cout << std::setw(12) << "数据规模" << std::setw(15) << "时间(ms)" + << std::setw(20) << "吞吐量(G/s)" << std::endl; + std::cout << std::string(47, '-') << std::endl; + } + + // 显示SortPair性能表头 + static void printSortPairHeader() { + std::cout << "\nSortPair 性能测试..." 
<< std::endl; + std::cout << "数据类型: " << std::endl; + std::cout << "计算公式:" << std::endl; + std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; + std::cout << std::setw(12) << "数据规模" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)" + << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl; + std::cout << std::string(78, '-') << std::endl; + } + + // 显示TopkPair性能表头 + static void printTopkPairHeader() { + std::cout << "\nTopkPair 性能测试..." << std::endl; + std::cout << "数据类型: " << std::endl; + std::cout << "计算公式:" << std::endl; + std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; + } + + static void printTopkPairDataHeader() { + std::cout << std::setw(8) << "k值" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)" + << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl; + std::cout << std::string(74, '-') << std::endl; + } + + // 显示性能数据行 + static void printReduceSumData(int size, float time_ms, const PerformanceCalculator::ReduceSumMetrics& metrics) { + std::cout << std::setw(12) << size << std::setw(15) << std::fixed << std::setprecision(3) + << time_ms << std::setw(20) << std::setprecision(3) << metrics.throughput_gps << std::endl; + } + + static void printSortPairData(int size, float asc_time, float desc_time, + const PerformanceCalculator::SortPairMetrics& asc_metrics, + const PerformanceCalculator::SortPairMetrics& desc_metrics) { + std::cout << std::setw(12) << size << std::setw(15) << std::fixed << std::setprecision(3) + << asc_time << std::setw(15) << desc_time << std::setw(16) << std::setprecision(3) + << asc_metrics.throughput_gps << std::setw(16) << desc_metrics.throughput_gps << std::endl; + } + + static void printTopkPairData(int k, float asc_time, float desc_time, + const PerformanceCalculator::TopkPairMetrics& asc_metrics, + const PerformanceCalculator::TopkPairMetrics& desc_metrics) { + std::cout << std::setw(8) << k << std::setw(15) << std::fixed << std::setprecision(3) + << asc_time << std::setw(15) << desc_time << std::setw(16) << std::setprecision(3) + << asc_metrics.throughput_gps << std::setw(16) << desc_metrics.throughput_gps << std::endl; + } + + // 显示性能文件保存消息 + static void printSavedMessage(const std::string& filename) { + std::cout << "\n性能结果已保存到: " << filename << std::endl; + } +}; \ No newline at end of file diff --git a/example/a guide to GPUKernelContest/src/reduce_sum_algorithm.maca b/example/a guide to GPUKernelContest/src/reduce_sum_algorithm.maca new file mode 100644 index 0000000..4f95d03 --- /dev/null +++ b/example/a guide to GPUKernelContest/src/reduce_sum_algorithm.maca @@ -0,0 +1,277 @@ +#include "test_utils.h" +#include "performance_utils.h" +#include "yaml_reporter.h" +#include +#include +#include + + +// ============================================================================ +// 实现标记宏 - 参赛者修改实现时请将此宏设为0 +// ============================================================================ +#ifndef USE_DEFAULT_REF_IMPL +#define USE_DEFAULT_REF_IMPL 1 // 1=默认实现, 0=参赛者自定义实现 +#endif + +#if USE_DEFAULT_REF_IMPL +#include +#include +#include +#include +#endif + +// 误差容忍度 +constexpr double REDUCE_ERROR_TOLERANCE = 0.005; // 0.5% + +// ============================================================================ +// ReduceSum算法实现接口 +// 参赛者需要替换Thrust实现为自己的高性能kernel +// ============================================================================ + +template +class ReduceSumAlgorithm { +public: + // 主要接口函数 - 参赛者需要实现这个函数 + void reduce(const InputT* d_in, OutputT* d_out, int num_items, OutputT 
init_value) { + +#if !USE_DEFAULT_REF_IMPL + // ======================================== + // 参赛者自定义实现区域 + // ======================================== + + // TODO: 参赛者在此实现自己的高性能归约算法 + + // 示例:参赛者可以调用1个或多个自定义kernel + // blockReduceKernel<<>>(d_in, temp_results, num_items, init_value); + // finalReduceKernel<<<1, block>>>(temp_results, d_out, grid.x); +#else + // ======================================== + // 默认基准实现 + // ======================================== + auto input_ptr = thrust::device_pointer_cast(d_in); + auto output_ptr = thrust::device_pointer_cast(d_out); + + // 直接使用thrust::reduce进行归约 + *output_ptr = thrust::reduce( + thrust::device, + input_ptr, + input_ptr + num_items, + static_cast(init_value) + ); +#endif + } + + // 获取当前实现状态 + static const char* getImplementationStatus() { +#if USE_DEFAULT_REF_IMPL + return "DEFAULT_REF_IMPL"; +#else + return "CUSTOM_IMPL"; +#endif + } + +private: + // 参赛者可以在这里添加辅助函数和成员变量 + // 例如:中间结果缓冲区、多阶段归约等 +}; + +// ============================================================================ +// 测试和性能评估 +// ============================================================================ + +bool testCorrectness() { + std::cout << "ReduceSum 正确性测试..." << std::endl; + TestDataGenerator generator; + ReduceSumAlgorithm algorithm; + + bool allPassed = true; + + // 测试不同数据规模 + for (int i = 0; i < NUM_TEST_SIZES && i < 2; i++) { // 限制测试规模 + int size = std::min(TEST_SIZES[i], 10000); + std::cout << " 测试规模: " << size << std::endl; + + // 测试普通数据 + { + auto data = generator.generateRandomFloats(size, -10.0f, 10.0f); + float init_value = 1.0f; + + // CPU参考计算 + double cpu_result = cpuReduceSum(data, static_cast(init_value)); + + // GPU计算 + float *d_in; + float *d_out; + MACA_CHECK(mcMalloc(&d_in, size * sizeof(float))); + MACA_CHECK(mcMalloc(&d_out, sizeof(float))); + + MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice)); + + algorithm.reduce(d_in, d_out, size, init_value); + + float gpu_result; + MACA_CHECK(mcMemcpy(&gpu_result, d_out, sizeof(float), mcMemcpyDeviceToHost)); + + // 验证误差 + double relative_error = std::abs(gpu_result - cpu_result) / std::abs(cpu_result); + if (relative_error > REDUCE_ERROR_TOLERANCE) { + std::cout << " 失败: 误差过大 " << relative_error << std::endl; + allPassed = false; + } else { + std::cout << " 通过 (误差: " << relative_error << ")" << std::endl; + } + + mcFree(d_in); + mcFree(d_out); + } + + // 测试特殊值 (NaN, Inf) + if (size > 100) { + std::cout << " 测试特殊值..." 
<< std::endl; + auto data = generator.generateSpecialFloats(size); + float init_value = 0.0f; + + double cpu_result = cpuReduceSum(data, static_cast(init_value)); + + float *d_in; + float *d_out; + MACA_CHECK(mcMalloc(&d_in, size * sizeof(float))); + MACA_CHECK(mcMalloc(&d_out, sizeof(float))); + + MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice)); + + algorithm.reduce(d_in, d_out, size, init_value); + + float gpu_result; + MACA_CHECK(mcMemcpy(&gpu_result, d_out, sizeof(float), mcMemcpyDeviceToHost)); + + // 对于包含特殊值的情况,检查是否正确处理 + if (std::isfinite(cpu_result) && std::isfinite(gpu_result)) { + double relative_error = std::abs(gpu_result - cpu_result) / std::abs(cpu_result); + if (relative_error > REDUCE_ERROR_TOLERANCE) { + std::cout << " 失败: 特殊值处理错误" << std::endl; + allPassed = false; + } else { + std::cout << " 通过 (特殊值处理)" << std::endl; + } + } else { + std::cout << " 通过 (特殊值结果)" << std::endl; + } + + mcFree(d_in); + mcFree(d_out); + } + } + + return allPassed; +} + +void benchmarkPerformance() { + PerformanceDisplay::printReduceSumHeader(); + + TestDataGenerator generator; + PerformanceMeter meter; + ReduceSumAlgorithm algorithm; + + const int WARMUP_ITERATIONS = 5; + const int BENCHMARK_ITERATIONS = 10; + + // 用于YAML报告的数据收集 + std::vector> perf_data; + + for (int i = 0; i < NUM_TEST_SIZES; i++) { + int size = TEST_SIZES[i]; + + // 生成测试数据 + auto data = generator.generateRandomFloats(size); + float init_value = 0.0f; + + // 分配GPU内存 + float *d_in; + float *d_out; + MACA_CHECK(mcMalloc(&d_in, size * sizeof(float))); + MACA_CHECK(mcMalloc(&d_out, sizeof(float))); + + MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice)); + + // Warmup阶段 + for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) { + algorithm.reduce(d_in, d_out, size, init_value); + } + + // 正式测试阶段 + float total_time = 0; + for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) { + meter.startTiming(); + algorithm.reduce(d_in, d_out, size, init_value); + total_time += meter.stopTiming(); + } + + float avg_time = total_time / BENCHMARK_ITERATIONS; + + // 计算性能指标 + auto metrics = PerformanceCalculator::calculateReduceSum(size, avg_time); + + // 显示性能数据 + PerformanceDisplay::printReduceSumData(size, avg_time, metrics); + + // 收集YAML报告数据 + auto entry = YAMLPerformanceReporter::createEntry(); + entry["data_size"] = std::to_string(size); + entry["time_ms"] = std::to_string(avg_time); + entry["throughput_gps"] = std::to_string(metrics.throughput_gps); + entry["data_type"] = "float"; + perf_data.push_back(entry); + + mcFree(d_in); + mcFree(d_out); + } + + // 生成YAML性能报告 + YAMLPerformanceReporter::generateReduceSumYAML(perf_data, "reduce_sum_performance.yaml"); + PerformanceDisplay::printSavedMessage("reduce_sum_performance.yaml"); +} + +// ============================================================================ +// 主函数 +// ============================================================================ +int main(int argc, char* argv[]) { + std::cout << "=== ReduceSum 算法测试 ===" << std::endl; + + // 检查参数 + std::string mode = "all"; + if (argc > 1) { + mode = argv[1]; + } + + bool correctness_passed = true; + bool performance_completed = true; + + try { + if (mode == "correctness" || mode == "all") { + correctness_passed = testCorrectness(); + } + + if (mode == "performance" || mode == "all") { + if (correctness_passed || mode == "performance") { + benchmarkPerformance(); + } else { + std::cout << "跳过性能测试,因为正确性测试未通过" << std::endl; + performance_completed = false; + } + } + + 
std::cout << "\n=== 测试完成 ===" << std::endl; + std::cout << "实现状态: " << ReduceSumAlgorithm::getImplementationStatus() << std::endl; + if (mode == "all") { + std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl; + std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl; + } + + return correctness_passed ? 0 : 1; + + } catch (const std::exception& e) { + std::cerr << "测试出错: " << e.what() << std::endl; + return 1; + } +} \ No newline at end of file diff --git a/example/a guide to GPUKernelContest/src/sort_pair_algorithm.maca b/example/a guide to GPUKernelContest/src/sort_pair_algorithm.maca new file mode 100644 index 0000000..9cdb6b3 --- /dev/null +++ b/example/a guide to GPUKernelContest/src/sort_pair_algorithm.maca @@ -0,0 +1,275 @@ +#include "test_utils.h" +#include "performance_utils.h" +#include "yaml_reporter.h" +#include +#include +#include + +// ============================================================================ +// 实现标记宏 - 参赛者修改实现时请将此宏设为0 +// ============================================================================ +#ifndef USE_DEFAULT_REF_IMPL +#define USE_DEFAULT_REF_IMPL 1 // 1=默认实现, 0=参赛者自定义实现 +#endif + +#if USE_DEFAULT_REF_IMPL +#include +#include +#include +#include +#include +#endif + +// ============================================================================ +// SortPair算法实现接口 +// 参赛者需要替换Thrust实现为自己的高性能kernel +// ============================================================================ + +template +class SortPairAlgorithm { +public: + // 主要接口函数 - 参赛者需要实现这个函数 + void sort(const KeyType* d_keys_in, KeyType* d_keys_out, + const ValueType* d_values_in, ValueType* d_values_out, + int num_items, bool descending) { + +#if !USE_DEFAULT_REF_IMPL + // ======================================== + // 参赛者自定义实现区域 + // ======================================== + + // TODO: 参赛者在此实现自己的高性能排序算法 + + // 示例:参赛者可以调用1个或多个自定义kernel + // preprocessKernel<<>>(d_keys_in, d_values_in, num_items); + // mainSortKernel<<>>(d_keys_out, d_values_out, num_items, descending); + // postprocessKernel<<>>(d_keys_out, d_values_out, num_items); +#else + // ======================================== + // 默认基准实现 + // ======================================== + + MACA_CHECK(mcMemcpy(d_keys_out, d_keys_in, num_items * sizeof(KeyType), mcMemcpyDeviceToDevice)); + MACA_CHECK(mcMemcpy(d_values_out, d_values_in, num_items * sizeof(ValueType), mcMemcpyDeviceToDevice)); + + auto key_ptr = thrust::device_pointer_cast(d_keys_out); + auto value_ptr = thrust::device_pointer_cast(d_values_out); + + if (descending) { + thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::greater()); + } else { + thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::less()); + } +#endif + } + + // 获取当前实现状态 + static const char* getImplementationStatus() { +#if USE_DEFAULT_REF_IMPL + return "DEFAULT_REF_IMPL"; +#else + return "CUSTOM_IMPL"; +#endif + } + +private: + // 参赛者可以在这里添加辅助函数和成员变量 + // 例如:临时缓冲区、多个kernel函数、流等 +}; + +// ============================================================================ +// 测试和性能评估 +// ============================================================================ + +bool testCorrectness() { + std::cout << "SortPair 正确性测试..." 
<< std::endl; + TestDataGenerator generator; + SortPairAlgorithm algorithm; + + // 测试小规模数据 + int size = 10000; + auto keys = generator.generateRandomFloats(size); + auto values = generator.generateRandomUint32(size); + + // 分配GPU内存 + float *d_keys_in, *d_keys_out; + uint32_t *d_values_in, *d_values_out; + + MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); + MACA_CHECK(mcMalloc(&d_keys_out, size * sizeof(float))); + MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); + MACA_CHECK(mcMalloc(&d_values_out, size * sizeof(uint32_t))); + + MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); + MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); + + // 测试升序和降序 + bool allPassed = true; + for (bool descending : {false, true}) { + std::cout << " " << (descending ? "降序" : "升序") << " 测试..." << std::endl; + + // CPU参考结果 + auto cpu_keys = keys; + auto cpu_values = values; + cpuSortPair(cpu_keys, cpu_values, descending); + + // GPU算法结果 + algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending); + + // 获取结果 + std::vector gpu_keys(size); + std::vector gpu_values(size); + MACA_CHECK(mcMemcpy(gpu_keys.data(), d_keys_out, size * sizeof(float), mcMemcpyDeviceToHost)); + MACA_CHECK(mcMemcpy(gpu_values.data(), d_values_out, size * sizeof(uint32_t), mcMemcpyDeviceToHost)); + + // 验证结果 + bool keysMatch = compareArrays(cpu_keys, gpu_keys, 1e-5); + bool valuesMatch = compareArrays(cpu_values, gpu_values); + + if (!keysMatch || !valuesMatch) { + std::cout << " 失败: 结果不匹配" << std::endl; + allPassed = false; + } else { + std::cout << " 通过" << std::endl; + } + } + + // 清理内存 + mcFree(d_keys_in); + mcFree(d_keys_out); + mcFree(d_values_in); + mcFree(d_values_out); + + return allPassed; +} + +void benchmarkPerformance() { + PerformanceDisplay::printSortPairHeader(); + + TestDataGenerator generator; + PerformanceMeter meter; + SortPairAlgorithm algorithm; + + const int WARMUP_ITERATIONS = 5; + const int BENCHMARK_ITERATIONS = 10; + + // 用于YAML报告的数据收集 + std::vector> perf_data; + + for (int i = 0; i < NUM_TEST_SIZES; i++) { + int size = TEST_SIZES[i]; + + // 生成测试数据 + auto keys = generator.generateRandomFloats(size); + auto values = generator.generateRandomUint32(size); + + // 分配GPU内存 + float *d_keys_in, *d_keys_out; + uint32_t *d_values_in, *d_values_out; + + MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); + MACA_CHECK(mcMalloc(&d_keys_out, size * sizeof(float))); + MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); + MACA_CHECK(mcMalloc(&d_values_out, size * sizeof(uint32_t))); + + MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); + MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); + + float asc_time = 0, desc_time = 0; + + // 测试升序和降序 + for (bool descending : {false, true}) { + // Warmup阶段 + for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) { + algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending); + } + + // 正式测试阶段 + float total_time = 0; + for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) { + meter.startTiming(); + algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending); + total_time += meter.stopTiming(); + } + + float avg_time = total_time / BENCHMARK_ITERATIONS; + if (descending) { + desc_time = avg_time; + } else { + asc_time = avg_time; + } + } + + // 计算性能指标 + auto asc_metrics = PerformanceCalculator::calculateSortPair(size, 
asc_time); + auto desc_metrics = PerformanceCalculator::calculateSortPair(size, desc_time); + + // 显示性能数据 + PerformanceDisplay::printSortPairData(size, asc_time, desc_time, asc_metrics, desc_metrics); + + // 收集YAML报告数据 + auto entry = YAMLPerformanceReporter::createEntry(); + entry["data_size"] = std::to_string(size); + entry["asc_time_ms"] = std::to_string(asc_time); + entry["desc_time_ms"] = std::to_string(desc_time); + entry["asc_throughput_gps"] = std::to_string(asc_metrics.throughput_gps); + entry["desc_throughput_gps"] = std::to_string(desc_metrics.throughput_gps); + entry["key_type"] = "float"; + entry["value_type"] = "uint32_t"; + perf_data.push_back(entry); + + // 清理内存 + mcFree(d_keys_in); + mcFree(d_keys_out); + mcFree(d_values_in); + mcFree(d_values_out); + } + + // 生成YAML性能报告 + YAMLPerformanceReporter::generateSortPairYAML(perf_data, "sort_pair_performance.yaml"); + PerformanceDisplay::printSavedMessage("sort_pair_performance.yaml"); +} + +// ============================================================================ +// 主函数 +// ============================================================================ +int main(int argc, char* argv[]) { + std::cout << "=== SortPair 算法测试 ===" << std::endl; + + // 检查参数 + std::string mode = "all"; + if (argc > 1) { + mode = argv[1]; + } + + bool correctness_passed = true; + bool performance_completed = true; + + try { + if (mode == "correctness" || mode == "all") { + correctness_passed = testCorrectness(); + } + + if (mode == "performance" || mode == "all") { + if (correctness_passed || mode == "performance") { + benchmarkPerformance(); + } else { + std::cout << "跳过性能测试,因为正确性测试未通过" << std::endl; + performance_completed = false; + } + } + + std::cout << "\n=== 测试完成 ===" << std::endl; + std::cout << "实现状态: " << SortPairAlgorithm::getImplementationStatus() << std::endl; + if (mode == "all") { + std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl; + std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl; + } + + return correctness_passed ? 
0 : 1; + + } catch (const std::exception& e) { + std::cerr << "测试出错: " << e.what() << std::endl; + return 1; + } +} \ No newline at end of file diff --git a/example/a guide to GPUKernelContest/src/test_utils.h b/example/a guide to GPUKernelContest/src/test_utils.h new file mode 100644 index 0000000..57e5622 --- /dev/null +++ b/example/a guide to GPUKernelContest/src/test_utils.h @@ -0,0 +1,234 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +// 引入模块化头文件 +#include "yaml_reporter.h" +#include "performance_utils.h" + +// ============================================================================ +// 测试配置常量 +// ============================================================================ +#ifndef RUN_FULL_TEST +const int TEST_SIZES[] = {1000000, 134217728}; // 1M, 128M, 512M, 1G +#else +const int TEST_SIZES[] = {1000000, 134217728, 536870912, 1073741824}; // 1M, 128M, 512M, 1G +#endif + +const int NUM_TEST_SIZES = sizeof(TEST_SIZES) / sizeof(TEST_SIZES[0]); + +// 性能测试重复次数 +constexpr int WARMUP_ITERATIONS = 5; +constexpr int BENCHMARK_ITERATIONS = 10; + + +// ============================================================================ +// 错误检查宏 +// ============================================================================ +#define MACA_CHECK(call) \ + do { \ + mcError_t error = call; \ + if (error != mcSuccess) { \ + std::cerr << "MACA error at " << __FILE__ << ":" << __LINE__ \ + << " - " << mcGetErrorString(error) << std::endl; \ + exit(1); \ + } \ + } while(0) + +// ============================================================================ +// 测试数据生成器 +// ============================================================================ +class TestDataGenerator { +private: + std::mt19937 rng; + +public: + TestDataGenerator(uint32_t seed = 42) : rng(seed) {} + + // 生成随机float数组 + std::vector generateRandomFloats(int size, float min_val = -1000.0f, float max_val = 1000.0f) { + std::vector data(size); + std::uniform_real_distribution dist(min_val, max_val); + for (int i = 0; i < size; i++) { + data[i] = dist(rng); + } + return data; + } + + // 生成随机half数组 + std::vector generateRandomHalfs(int size, float min_val = -100.0f, float max_val = 100.0f) { + std::vector data(size); + std::uniform_real_distribution dist(min_val, max_val); + for (int i = 0; i < size; i++) { + data[i] = __float2half(dist(rng)); + } + return data; + } + + // 生成随机uint32_t数组 + std::vector generateRandomUint32(int size) { + std::vector data(size); + for (int i = 0; i < size; i++) { + data[i] = static_cast(i); // 使用索引作为值,便于验证稳定排序 + } + return data; + } + + // 生成随机int64_t数组 + std::vector generateRandomInt64(int size) { + std::vector data(size); + for (int i = 0; i < size; i++) { + data[i] = static_cast(i); + } + return data; + } + + // 生成包含NaN和Inf的测试数据 (half版本) + std::vector generateSpecialHalfs(int size) { + std::vector data = generateRandomHalfs(size, -10.0f, 10.0f); + if (size > 100) { + data[10] = __float2half(NAN); + data[20] = __float2half(INFINITY); + data[30] = __float2half(-INFINITY); + } + return data; + } + + // 生成包含NaN和Inf的测试数据 (float版本) + std::vector generateSpecialFloats(int size) { + std::vector data = generateRandomFloats(size, -10.0f, 10.0f); + if (size > 100) { + data[10] = NAN; + data[20] = INFINITY; + data[30] = -INFINITY; + } + return data; + } +}; + +// ============================================================================ +// 性能测试工具 +// ============================================================================ +class PerformanceMeter { +private: + 
mcEvent_t start, stop; + +public: + PerformanceMeter() { + MACA_CHECK(mcEventCreate(&start)); + MACA_CHECK(mcEventCreate(&stop)); + } + + ~PerformanceMeter() { + mcEventDestroy(start); + mcEventDestroy(stop); + } + + void startTiming() { + MACA_CHECK(mcEventRecord(start)); + } + + float stopTiming() { + MACA_CHECK(mcEventRecord(stop)); + MACA_CHECK(mcEventSynchronize(stop)); + float milliseconds = 0; + MACA_CHECK(mcEventElapsedTime(&milliseconds, start, stop)); + return milliseconds; + } +}; + +// ============================================================================ +// 正确性验证工具 +// ============================================================================ +template +bool compareArrays(const std::vector& a, const std::vector& b, double tolerance = 1e-6) { + if (a.size() != b.size()) return false; + + for (size_t i = 0; i < a.size(); i++) { + if constexpr (std::is_same_v) { + float fa = __half2float(a[i]); + float fb = __half2float(b[i]); + if (std::isnan(fa) && std::isnan(fb)) continue; + if (std::isinf(fa) && std::isinf(fb) && (fa > 0) == (fb > 0)) continue; + if (std::abs(fa - fb) > tolerance) return false; + } else if constexpr (std::is_floating_point_v) { + if (std::isnan(a[i]) && std::isnan(b[i])) continue; + if (std::isinf(a[i]) && std::isinf(b[i]) && (a[i] > 0) == (b[i] > 0)) continue; + if (std::abs(a[i] - b[i]) > tolerance) return false; + } else { + if (a[i] != b[i]) return false; + } + } + return true; +} + +// CPU参考实现 - 稳定排序 +template +void cpuSortPair(std::vector& keys, std::vector& values, bool descending) { + std::vector> pairs; + for (size_t i = 0; i < keys.size(); i++) { + pairs.emplace_back(keys[i], values[i]); + } + + if (descending) { + std::stable_sort(pairs.begin(), pairs.end(), + [](const auto& a, const auto& b) { return a.first > b.first; }); + } else { + std::stable_sort(pairs.begin(), pairs.end()); + } + + for (size_t i = 0; i < pairs.size(); i++) { + keys[i] = pairs[i].first; + values[i] = pairs[i].second; + } +} + +// CPU参考实现 - TopK +template +void cpuTopkPair(const std::vector& keys_in, const std::vector& values_in, + std::vector& keys_out, std::vector& values_out, + int k, bool descending) { + std::vector> pairs; + for (size_t i = 0; i < keys_in.size(); i++) { + pairs.emplace_back(keys_in[i], values_in[i]); + } + + if (descending) { + std::stable_sort(pairs.begin(), pairs.end(), + [](const auto& a, const auto& b) { return a.first > b.first; }); + } else { + std::stable_sort(pairs.begin(), pairs.end()); + } + + keys_out.resize(k); + values_out.resize(k); + for (int i = 0; i < k; i++) { + keys_out[i] = pairs[i].first; + values_out[i] = pairs[i].second; + } +} + +// CPU参考实现 - ReduceSum (使用double精度) +template +double cpuReduceSum(const std::vector& data, double init_value) { + double sum = init_value; + for (const auto& val : data) { + if constexpr (std::is_same_v) { + float f_val = __half2float(val); + if (!std::isnan(f_val)) { + sum += static_cast(f_val); + } + } else { + if (!std::isnan(val)) { + sum += static_cast(val); + } + } + } + return sum; +} diff --git a/example/a guide to GPUKernelContest/src/topk_pair_algorithm.maca b/example/a guide to GPUKernelContest/src/topk_pair_algorithm.maca new file mode 100644 index 0000000..92ff853 --- /dev/null +++ b/example/a guide to GPUKernelContest/src/topk_pair_algorithm.maca @@ -0,0 +1,317 @@ +#include "test_utils.h" +#include "performance_utils.h" +#include "yaml_reporter.h" +#include +#include +#include +#include +#include +#include + +// 
============================================================================ +// 实现标记宏 - 参赛者修改实现时请将此宏设为0 +// ============================================================================ +#ifndef USE_DEFAULT_REF_IMPL +#define USE_DEFAULT_REF_IMPL 1 // 1=默认实现, 0=参赛者自定义实现 +#endif + +#if USE_DEFAULT_REF_IMPL +#include +#include +#include +#include +#include +#include +#endif + +static const int TOPK_VALUES[] = {32, 50, 100, 256, 1024}; +static const int NUM_TOPK_VALUES = sizeof(TOPK_VALUES) / sizeof(TOPK_VALUES[0]); + +// ============================================================================ +// TopkPair算法实现接口 +// 参赛者需要替换Thrust实现为自己的高性能kernel +// ============================================================================ + +template +class TopkPairAlgorithm { +public: + // 主要接口函数 - 参赛者需要实现这个函数 + void topk(const KeyType* d_keys_in, KeyType* d_keys_out, + const ValueType* d_values_in, ValueType* d_values_out, + int num_items, int k, bool descending) { + +#if !USE_DEFAULT_REF_IMPL + // ======================================== + // 参赛者自定义实现区域 + // ======================================== + + // TODO: 参赛者在此实现自己的高性能TopK算法 + + // 示例:参赛者可以调用多个自定义kernel + // TopkKernel1<<>>(d_keys_in, d_values_in, temp_results, num_items, k); + // TopkKernel2<<>>(temp_results, d_keys_out, d_values_out, k, descending); +#else + // ======================================== + // 默认基准实现 + // ======================================== + + KeyType* temp_keys; + ValueType* temp_values; + MACA_CHECK(mcMalloc(&temp_keys, num_items * sizeof(KeyType))); + MACA_CHECK(mcMalloc(&temp_values, num_items * sizeof(ValueType))); + + MACA_CHECK(mcMemcpy(temp_keys, d_keys_in, num_items * sizeof(KeyType), mcMemcpyDeviceToDevice)); + MACA_CHECK(mcMemcpy(temp_values, d_values_in, num_items * sizeof(ValueType), mcMemcpyDeviceToDevice)); + + auto key_ptr = thrust::device_pointer_cast(temp_keys); + auto value_ptr = thrust::device_pointer_cast(temp_values); + + // 由于greater和less是不同类型,需要分别调用 + if (descending) { + thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::greater()); + } else { + thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::less()); + } + + MACA_CHECK(mcMemcpy(d_keys_out, temp_keys, k * sizeof(KeyType), mcMemcpyDeviceToDevice)); + MACA_CHECK(mcMemcpy(d_values_out, temp_values, k * sizeof(ValueType), mcMemcpyDeviceToDevice)); + + mcFree(temp_keys); + mcFree(temp_values); +#endif + } + + // 获取当前实现状态 + static const char* getImplementationStatus() { +#if USE_DEFAULT_REF_IMPL + return "DEFAULT_REF_IMPL"; +#else + return "CUSTOM_IMPL"; +#endif + } + +private: + // 参赛者可以在这里添加辅助函数和成员变量 + // 例如:分块大小、临时缓冲区、多流处理等 +}; + +// ============================================================================ +// 测试和性能评估 +// ============================================================================ + +bool testCorrectness() { + std::cout << "TopkPair 正确性测试..." 
<< std::endl; + TestDataGenerator generator; + TopkPairAlgorithm algorithm; + + int size = 10000; + auto keys = generator.generateRandomFloats(size); + auto values = generator.generateRandomUint32(size); + + // 分配GPU内存 + float *d_keys_in, *d_keys_out; + uint32_t *d_values_in, *d_values_out; + + MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); + MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); + + MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); + MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); + + bool allPassed = true; + + // 测试不同k值 + for (int ki = 0; ki < NUM_TOPK_VALUES && ki < 4; ki++) { // 限制测试范围 + int k = TOPK_VALUES[ki]; + if (k > size) continue; + + std::cout << " 测试 k=" << k << std::endl; + + MACA_CHECK(mcMalloc(&d_keys_out, k * sizeof(float))); + MACA_CHECK(mcMalloc(&d_values_out, k * sizeof(uint32_t))); + + for (bool descending : {false, true}) { + std::cout << " " << (descending ? "降序" : "升序") << " TopK..." << std::endl; + + // CPU参考结果 + std::vector cpu_keys_out; + std::vector cpu_values_out; + cpuTopkPair(keys, values, cpu_keys_out, cpu_values_out, k, descending); + + // GPU算法结果 + algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending); + + // 获取结果 + std::vector gpu_keys_out(k); + std::vector gpu_values_out(k); + MACA_CHECK(mcMemcpy(gpu_keys_out.data(), d_keys_out, k * sizeof(float), mcMemcpyDeviceToHost)); + MACA_CHECK(mcMemcpy(gpu_values_out.data(), d_values_out, k * sizeof(uint32_t), mcMemcpyDeviceToHost)); + + // 验证结果 + bool keysMatch = compareArrays(cpu_keys_out, gpu_keys_out, 1e-5); + bool valuesMatch = compareArrays(cpu_values_out, gpu_values_out); + + if (!keysMatch || !valuesMatch) { + std::cout << " 失败: 结果不匹配" << std::endl; + allPassed = false; + } else { + std::cout << " 通过" << std::endl; + } + } + + mcFree(d_keys_out); + mcFree(d_values_out); + } + + // 清理内存 + mcFree(d_keys_in); + mcFree(d_values_in); + + return allPassed; +} + +void benchmarkPerformance() { + std::cout << "\nTopkPair 性能测试..." 
<< std::endl; + std::cout << "数据类型: " << std::endl; + std::cout << "计算公式:" << std::endl; + std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; + + TestDataGenerator generator; + PerformanceMeter meter; + TopkPairAlgorithm algorithm; + + const int WARMUP_ITERATIONS = 5; + const int BENCHMARK_ITERATIONS = 10; + + // 用于YAML报告的数据收集 + std::vector> perf_data; + + // 针对不同数据规模测试 + for (int size_idx = 0; size_idx < NUM_TEST_SIZES; size_idx++) { + int size = TEST_SIZES[size_idx]; + std::cout << "\n数据规模: " << size << std::endl; + std::cout << std::setw(8) << "k值" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)" + << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl; + std::cout << std::string(74, '-') << std::endl; + + auto keys = generator.generateRandomFloats(size); + auto values = generator.generateRandomUint32(size); + + // 分配GPU内存 + float *d_keys_in; + uint32_t *d_values_in; + + MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); + MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); + + MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); + MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); + + for (int ki = 0; ki < NUM_TOPK_VALUES; ki++) { + int k = TOPK_VALUES[ki]; + if (k > size) continue; + + float *d_keys_out; + uint32_t *d_values_out; + MACA_CHECK(mcMalloc(&d_keys_out, k * sizeof(float))); + MACA_CHECK(mcMalloc(&d_values_out, k * sizeof(uint32_t))); + + float asc_time = 0, desc_time = 0; + + for (bool descending : {false, true}) { + // Warmup阶段 + for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) { + algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending); + } + + // 正式测试阶段 + float total_time = 0; + for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) { + meter.startTiming(); + algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending); + total_time += meter.stopTiming(); + } + + float avg_time = total_time / BENCHMARK_ITERATIONS; + if (descending) { + desc_time = avg_time; + } else { + asc_time = avg_time; + } + } + + // 计算性能指标 + auto asc_metrics = PerformanceCalculator::calculateTopkPair(size, k, asc_time); + auto desc_metrics = PerformanceCalculator::calculateTopkPair(size, k, desc_time); + + // 显示性能数据 + PerformanceDisplay::printTopkPairData(k, asc_time, desc_time, asc_metrics, desc_metrics); + + // 收集YAML报告数据 + auto entry = YAMLPerformanceReporter::createEntry(); + entry["data_size"] = std::to_string(size); + entry["k_value"] = std::to_string(k); + entry["asc_time_ms"] = std::to_string(asc_time); + entry["desc_time_ms"] = std::to_string(desc_time); + entry["asc_throughput_gps"] = std::to_string(asc_metrics.throughput_gps); + entry["desc_throughput_gps"] = std::to_string(desc_metrics.throughput_gps); + entry["key_type"] = "float"; + entry["value_type"] = "uint32_t"; + perf_data.push_back(entry); + + mcFree(d_keys_out); + mcFree(d_values_out); + } + + mcFree(d_keys_in); + mcFree(d_values_in); + } + + // 生成YAML性能报告 + YAMLPerformanceReporter::generateTopkPairYAML(perf_data, "topk_pair_performance.yaml"); + PerformanceDisplay::printSavedMessage("topk_pair_performance.yaml"); +} + +// ============================================================================ +// 主函数 +// ============================================================================ +int main(int argc, char* argv[]) { + std::cout << "=== TopkPair 算法测试 ===" << std::endl; + + // 检查参数 + std::string mode = "all"; + if (argc > 1) { + mode = 
argv[1]; + } + + bool correctness_passed = true; + bool performance_completed = true; + + try { + if (mode == "correctness" || mode == "all") { + correctness_passed = testCorrectness(); + } + + if (mode == "performance" || mode == "all") { + if (correctness_passed || mode == "performance") { + benchmarkPerformance(); + } else { + std::cout << "跳过性能测试,因为正确性测试未通过" << std::endl; + performance_completed = false; + } + } + + std::cout << "\n=== 测试完成 ===" << std::endl; + std::cout << "实现状态: " << TopkPairAlgorithm::getImplementationStatus() << std::endl; + if (mode == "all") { + std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl; + std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl; + } + + return correctness_passed ? 0 : 1; + + } catch (const std::exception& e) { + std::cerr << "测试出错: " << e.what() << std::endl; + return 1; + } +} diff --git a/example/a guide to GPUKernelContest/src/yaml_reporter.h b/example/a guide to GPUKernelContest/src/yaml_reporter.h new file mode 100644 index 0000000..c39d5c3 --- /dev/null +++ b/example/a guide to GPUKernelContest/src/yaml_reporter.h @@ -0,0 +1,154 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +// ============================================================================ +// YAML性能报告生成器 +// ============================================================================ + +class YAMLPerformanceReporter { +public: + struct PerformanceData { + std::string algorithm; + std::string input_type; + std::string output_type; + std::string key_type; + std::string value_type; + std::vector> metrics; + }; + + // 创建性能数据条目 + static std::map createEntry() { + return std::map(); + } + + // 生成ReduceSum性能YAML + static void generateReduceSumYAML(const std::vector>& perf_data, + const std::string& filename = "reduce_sum_performance.yaml") { + std::ofstream yaml_file(filename); + + // 写入头部信息 + writeHeader(yaml_file, "ReduceSum算法性能测试结果"); + + // 算法信息 + yaml_file << "algorithm: \"ReduceSum\"\n"; + yaml_file << "data_types:\n"; + yaml_file << " input: \"float\"\n"; + yaml_file << " output: \"float\"\n"; + + // 计算公式 + yaml_file << "formulas:\n"; + yaml_file << " throughput: \"elements / time(s) / 1e9 (G/s)\"\n"; + + // 性能数据 + yaml_file << "performance_data:\n"; + for (const auto& data : perf_data) { + yaml_file << " - data_size: " << data.at("data_size") << "\n"; + yaml_file << " time_ms: " << formatFloat(data.at("time_ms")) << "\n"; + yaml_file << " throughput_gps: " << formatFloat(data.at("throughput_gps")) << "\n"; + yaml_file << " data_type: \"" << data.at("data_type") << "\"\n"; + } + + yaml_file.close(); + } + + // 生成SortPair性能YAML + static void generateSortPairYAML(const std::vector>& perf_data, + const std::string& filename = "sort_pair_performance.yaml") { + std::ofstream yaml_file(filename); + + // 写入头部信息 + writeHeader(yaml_file, "SortPair算法性能测试结果"); + + // 算法信息 + yaml_file << "algorithm: \"SortPair\"\n"; + yaml_file << "data_types:\n"; + yaml_file << " key_type: \"float\"\n"; + yaml_file << " value_type: \"uint32_t\"\n"; + + // 计算公式 + yaml_file << "formulas:\n"; + yaml_file << " throughput: \"elements / time(s) / 1e9 (G/s)\"\n"; + + // 性能数据 + yaml_file << "performance_data:\n"; + for (const auto& data : perf_data) { + yaml_file << " - data_size: " << data.at("data_size") << "\n"; + yaml_file << " ascending:\n"; + yaml_file << " time_ms: " << formatFloat(data.at("asc_time_ms")) << "\n"; + yaml_file << " throughput_gps: " << formatFloat(data.at("asc_throughput_gps")) << "\n"; + 
yaml_file << " descending:\n"; + yaml_file << " time_ms: " << formatFloat(data.at("desc_time_ms")) << "\n"; + yaml_file << " throughput_gps: " << formatFloat(data.at("desc_throughput_gps")) << "\n"; + yaml_file << " key_type: \"" << data.at("key_type") << "\"\n"; + yaml_file << " value_type: \"" << data.at("value_type") << "\"\n"; + } + + yaml_file.close(); + } + + // 生成TopkPair性能YAML + static void generateTopkPairYAML(const std::vector>& perf_data, + const std::string& filename = "topk_pair_performance.yaml") { + std::ofstream yaml_file(filename); + + // 写入头部信息 + writeHeader(yaml_file, "TopkPair算法性能测试结果"); + + // 算法信息 + yaml_file << "algorithm: \"TopkPair\"\n"; + yaml_file << "data_types:\n"; + yaml_file << " key_type: \"float\"\n"; + yaml_file << " value_type: \"uint32_t\"\n"; + + // 计算公式 + yaml_file << "formulas:\n"; + yaml_file << " throughput: \"elements / time(s) / 1e9 (G/s)\"\n"; + + // 性能数据 + yaml_file << "performance_data:\n"; + for (const auto& data : perf_data) { + yaml_file << " - data_size: " << data.at("data_size") << "\n"; + yaml_file << " k_value: " << data.at("k_value") << "\n"; + yaml_file << " ascending:\n"; + yaml_file << " time_ms: " << formatFloat(data.at("asc_time_ms")) << "\n"; + yaml_file << " throughput_gps: " << formatFloat(data.at("asc_throughput_gps")) << "\n"; + yaml_file << " descending:\n"; + yaml_file << " time_ms: " << formatFloat(data.at("desc_time_ms")) << "\n"; + yaml_file << " throughput_gps: " << formatFloat(data.at("desc_throughput_gps")) << "\n"; + yaml_file << " key_type: \"" << data.at("key_type") << "\"\n"; + yaml_file << " value_type: \"" << data.at("value_type") << "\"\n"; + } + + yaml_file.close(); + } + +private: + // 写入YAML文件头部 + static void writeHeader(std::ofstream& file, const std::string& title) { + file << "# " << title << "\n"; + file << "# 生成时间: "; + + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + file << std::put_time(std::localtime(&time_t), "%Y-%m-%d %H:%M:%S"); + file << "\n\n"; + } + + // 格式化浮点数 + static std::string formatFloat(const std::string& value) { + try { + double d = std::stod(value); + std::ostringstream oss; + oss << std::fixed << std::setprecision(6) << d; + return oss.str(); + } catch (...) { + return value; + } + } +}; \ No newline at end of file