| @@ -0,0 +1,6 @@ | |||
| .DS_Store | |||
| *.bak | |||
| *.pyc | |||
| *.o | |||
| */build/ | |||
| cp_template/*.yaml | |||
| @@ -32,7 +32,52 @@ | |||
| --- | |||
| ## 📥 如何参与提交? | |||
| ## 🚀 快速上手 | |||
| 本竞赛旨在评估参赛者在GPU并行计算领域的算法优化能力。为帮助参赛者快速进入比赛状态,参赛者可从以下三个核心算法中选择实现其高性能版本: | |||
| - **ReduceSum**: 高精度归约求和 | |||
| - **SortPair**: 键值对稳定排序 | |||
| - **TopkPair**: 键值对TopK选择 | |||
| ### 📥 | |||
| ### 编译和测试 | |||
| #### 1. 全量编译和运行 | |||
| ```bash | |||
| # 编译并运行所有算法测试(默认行为) | |||
| ./run.sh | |||
| # 仅编译所有算法,不运行测试 | |||
| ./run.sh --build-only | |||
| # 编译并运行单个算法测试 | |||
| ./run.sh --run_reduce # ReduceSum算法 | |||
| ./run.sh --run_sort # SortPair算法 | |||
| ./run.sh --run_topk # TopkPair算法 | |||
| ``` | |||
| #### 2. 单独编译和运行 | |||
| ```bash | |||
| # 编译并运行ReduceSum算法(默认行为) | |||
| ./run_reduce_sum.sh | |||
| # 仅编译ReduceSum算法,不运行测试 | |||
| ./run_reduce_sum.sh --build-only | |||
| # 编译并运行SortPair正确性测试 | |||
| ./run_sort_pair.sh --run correctness | |||
| # 编译并运行TopkPair性能测试 | |||
| ./run_topk_pair.sh --run performance | |||
| ``` | |||
| #### 3. 手动运行测试 | |||
| ```bash | |||
| ./build/test_reducesum [correctness|performance|all] | |||
| ./build/test_sortpair [correctness|performance|all] | |||
| ./build/test_topkpair [correctness|performance|all] | |||
| ``` | |||
| ### ✅ 参赛要求: | |||
| - 提交内容必须可以在沐曦自研 GPU **曦云 C500** 上运行。 | |||
| @@ -72,7 +117,7 @@ | |||
| ## 🏅 排名规则 | |||
| - 比赛周期:2 个月 | |||
| - 比赛周期:2 个月 | |||
| - 排名按累计得分排序,取前 12 名! | |||
| 若得分相同: | |||
| @@ -0,0 +1,26 @@ | |||
| # ReduceSum算法性能测试结果 | |||
| # 生成时间: 2025-09-03 22:34:18 | |||
| algorithm: "ReduceSum" | |||
| data_types: | |||
| input: "float" | |||
| output: "float" | |||
| formulas: | |||
| throughput: "elements / time(s) / 1e9 (G/s)" | |||
| performance_data: | |||
| - data_size: 1000000 | |||
| time_ms: 0.048717 | |||
| throughput_gps: 20.526799 | |||
| data_type: "float" | |||
| - data_size: 134217728 | |||
| time_ms: 0.402560 | |||
| throughput_gps: 333.410496 | |||
| data_type: "float" | |||
| - data_size: 536870912 | |||
| time_ms: 1.346586 | |||
| throughput_gps: 398.690510 | |||
| data_type: "float" | |||
| - data_size: 1073741824 | |||
| time_ms: 2.639513 | |||
| throughput_gps: 406.795353 | |||
| data_type: "float" | |||
| @@ -0,0 +1,46 @@ | |||
| # SortPair算法性能测试结果 | |||
| # 生成时间: 2025-09-03 22:37:18 | |||
| algorithm: "SortPair" | |||
| data_types: | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| formulas: | |||
| throughput: "elements / time(s) / 1e9 (G/s)" | |||
| performance_data: | |||
| - data_size: 1000000 | |||
| ascending: | |||
| time_ms: 0.351488 | |||
| throughput_gps: 2.845047 | |||
| descending: | |||
| time_ms: 0.343270 | |||
| throughput_gps: 2.913155 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 134217728 | |||
| ascending: | |||
| time_ms: 22.273815 | |||
| throughput_gps: 6.025808 | |||
| descending: | |||
| time_ms: 22.494003 | |||
| throughput_gps: 5.966823 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 536870912 | |||
| ascending: | |||
| time_ms: 88.856277 | |||
| throughput_gps: 6.042014 | |||
| descending: | |||
| time_ms: 89.913918 | |||
| throughput_gps: 5.970943 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1073741824 | |||
| ascending: | |||
| time_ms: 181.409576 | |||
| throughput_gps: 5.918882 | |||
| descending: | |||
| time_ms: 183.428955 | |||
| throughput_gps: 5.853720 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| @@ -0,0 +1,210 @@ | |||
| # TopkPair算法性能测试结果 | |||
| # 生成时间: 2025-09-03 22:40:54 | |||
| algorithm: "TopkPair" | |||
| data_types: | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| formulas: | |||
| throughput: "elements / time(s) / 1e9 (G/s)" | |||
| performance_data: | |||
| - data_size: 1000000 | |||
| k_value: 32 | |||
| ascending: | |||
| time_ms: 0.402509 | |||
| throughput_gps: 2.484418 | |||
| descending: | |||
| time_ms: 0.416307 | |||
| throughput_gps: 2.402072 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1000000 | |||
| k_value: 50 | |||
| ascending: | |||
| time_ms: 0.404787 | |||
| throughput_gps: 2.470434 | |||
| descending: | |||
| time_ms: 0.414669 | |||
| throughput_gps: 2.411563 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1000000 | |||
| k_value: 100 | |||
| ascending: | |||
| time_ms: 0.398336 | |||
| throughput_gps: 2.510443 | |||
| descending: | |||
| time_ms: 0.408320 | |||
| throughput_gps: 2.449060 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1000000 | |||
| k_value: 256 | |||
| ascending: | |||
| time_ms: 0.410752 | |||
| throughput_gps: 2.434559 | |||
| descending: | |||
| time_ms: 0.403379 | |||
| throughput_gps: 2.479057 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1000000 | |||
| k_value: 1024 | |||
| ascending: | |||
| time_ms: 0.391091 | |||
| throughput_gps: 2.556949 | |||
| descending: | |||
| time_ms: 0.391142 | |||
| throughput_gps: 2.556613 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 134217728 | |||
| k_value: 32 | |||
| ascending: | |||
| time_ms: 22.394062 | |||
| throughput_gps: 5.993452 | |||
| descending: | |||
| time_ms: 22.263729 | |||
| throughput_gps: 6.028538 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 134217728 | |||
| k_value: 50 | |||
| ascending: | |||
| time_ms: 22.379187 | |||
| throughput_gps: 5.997435 | |||
| descending: | |||
| time_ms: 22.228352 | |||
| throughput_gps: 6.038132 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 134217728 | |||
| k_value: 100 | |||
| ascending: | |||
| time_ms: 22.436581 | |||
| throughput_gps: 5.982094 | |||
| descending: | |||
| time_ms: 22.229326 | |||
| throughput_gps: 6.037868 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 134217728 | |||
| k_value: 256 | |||
| ascending: | |||
| time_ms: 22.463232 | |||
| throughput_gps: 5.974996 | |||
| descending: | |||
| time_ms: 22.319946 | |||
| throughput_gps: 6.013354 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 134217728 | |||
| k_value: 1024 | |||
| ascending: | |||
| time_ms: 22.468454 | |||
| throughput_gps: 5.973608 | |||
| descending: | |||
| time_ms: 22.335976 | |||
| throughput_gps: 6.009038 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 536870912 | |||
| k_value: 32 | |||
| ascending: | |||
| time_ms: 89.437294 | |||
| throughput_gps: 6.002763 | |||
| descending: | |||
| time_ms: 88.605972 | |||
| throughput_gps: 6.059083 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 536870912 | |||
| k_value: 50 | |||
| ascending: | |||
| time_ms: 89.460587 | |||
| throughput_gps: 6.001200 | |||
| descending: | |||
| time_ms: 88.546509 | |||
| throughput_gps: 6.063152 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 536870912 | |||
| k_value: 100 | |||
| ascending: | |||
| time_ms: 89.203011 | |||
| throughput_gps: 6.018529 | |||
| descending: | |||
| time_ms: 88.809097 | |||
| throughput_gps: 6.045224 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 536870912 | |||
| k_value: 256 | |||
| ascending: | |||
| time_ms: 89.500465 | |||
| throughput_gps: 5.998526 | |||
| descending: | |||
| time_ms: 88.743912 | |||
| throughput_gps: 6.049665 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 536870912 | |||
| k_value: 1024 | |||
| ascending: | |||
| time_ms: 89.405357 | |||
| throughput_gps: 6.004908 | |||
| descending: | |||
| time_ms: 88.446083 | |||
| throughput_gps: 6.070036 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1073741824 | |||
| k_value: 32 | |||
| ascending: | |||
| time_ms: 182.233307 | |||
| throughput_gps: 5.892127 | |||
| descending: | |||
| time_ms: 181.076950 | |||
| throughput_gps: 5.929754 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1073741824 | |||
| k_value: 50 | |||
| ascending: | |||
| time_ms: 182.273239 | |||
| throughput_gps: 5.890836 | |||
| descending: | |||
| time_ms: 180.944550 | |||
| throughput_gps: 5.934093 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1073741824 | |||
| k_value: 100 | |||
| ascending: | |||
| time_ms: 182.374191 | |||
| throughput_gps: 5.887576 | |||
| descending: | |||
| time_ms: 181.277100 | |||
| throughput_gps: 5.923207 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1073741824 | |||
| k_value: 256 | |||
| ascending: | |||
| time_ms: 182.349457 | |||
| throughput_gps: 5.888374 | |||
| descending: | |||
| time_ms: 181.248199 | |||
| throughput_gps: 5.924152 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| - data_size: 1073741824 | |||
| k_value: 1024 | |||
| ascending: | |||
| time_ms: 182.378326 | |||
| throughput_gps: 5.887442 | |||
| descending: | |||
| time_ms: 181.025803 | |||
| throughput_gps: 5.931430 | |||
| key_type: "float" | |||
| value_type: "uint32_t" | |||
| @@ -1,59 +1,12 @@ | |||
| # GPU 高性能并行计算算法优化竞赛 | |||
| ## 🎯 竞赛概述 | |||
| 本竞赛旨在评估参赛者在GPU并行计算领域的算法优化能力。参赛者可选择实现三个核心算法的高性能版本: | |||
| - **ReduceSum**: 高精度归约求和 | |||
| - **SortPair**: 键值对稳定排序 | |||
| - **TopkPair**: 键值对TopK选择 | |||
| ## 🚀 快速开始 | |||
| ### 编译和测试 | |||
| #### 1. 全量编译和运行 | |||
| ```bash | |||
| # 编译并运行所有算法测试(默认行为) | |||
| ./build_and_run.sh | |||
| # 仅编译所有算法,不运行测试 | |||
| ./build_and_run.sh --build-only | |||
| # 编译并运行单个算法测试 | |||
| ./build_and_run.sh --run_reduce # ReduceSum算法 | |||
| ./build_and_run.sh --run_sort # SortPair算法 | |||
| ./build_and_run.sh --run_topk # TopkPair算法 | |||
| ``` | |||
| #### 2. 单独编译和运行 | |||
| ```bash | |||
| # 编译并运行ReduceSum算法(默认行为) | |||
| ./build_and_run_reduce_sum.sh | |||
| # 仅编译ReduceSum算法,不运行测试 | |||
| ./build_and_run_reduce_sum.sh --build-only | |||
| # 编译并运行SortPair正确性测试 | |||
| ./build_and_run_sort_pair.sh --run correctness | |||
| # 编译并运行TopkPair性能测试 | |||
| ./build_and_run_topk_pair.sh --run performance | |||
| ``` | |||
| #### 3. 手动运行测试 | |||
| ```bash | |||
| ./build/test_reducesum [correctness|performance|all] | |||
| ./build/test_sortpair [correctness|performance|all] | |||
| ./build/test_topkpair [correctness|performance|all] | |||
| ``` | |||
| ## 📝 参赛指南 | |||
| ### 实现位置 | |||
| 参赛者需要在以下文件中替换Thrust实现: | |||
| - `src/reduce_sum_algorithm.maca` - 替换Thrust归约求和 | |||
| - `src/sort_pair_algorithm.maca` - 替换Thrust稳定排序 | |||
| - `src/topk_pair_algorithm.maca` - 替换Thrust TopK选择 | |||
| - `reduce_sum_algorithm.maca` - 替换Thrust归约求和 | |||
| - `sort_pair_algorithm.maca` - 替换Thrust稳定排序 | |||
| - `topk_pair_algorithm.maca` - 替换Thrust TopK选择 | |||
| ### 算法要求 | |||
| 见competition_parallel_algorithms.md | |||
| @@ -92,25 +45,21 @@ | |||
| - 各数据规模的详细性能数据 | |||
| - 升序/降序分别统计(适用时) | |||
| ## 📁 项目结构 | |||
| ## 📁 提交内容结构 | |||
| ``` | |||
| ├── build_and_run.sh # 统一编译和运行脚本(默认编译+运行所有算法) | |||
| ├── build_common.sh # 公共编译配置和函数 | |||
| ├── build_and_run_reduce_sum.sh # ReduceSum独立编译和运行脚本 | |||
| ├── build_and_run_sort_pair.sh # SortPair独立编译和运行脚本 | |||
| ├── build_and_run_topk_pair.sh # TopkPair独立编译和运行脚本 | |||
| ├── run.sh # 统一编译和运行脚本(默认编译+运行所有算法) | |||
| ├── competition_parallel_algorithms.md # 详细题目说明 | |||
| ├── src/ # 算法实现和工具文件 | |||
| │ ├── reduce_sum_algorithm.maca # 1. ReduceSum测试程序 | |||
| │ ├── sort_pair_algorithm.maca # 2. SortPair测试程序 | |||
| │ ├── topk_pair_algorithm.maca # 3. TopkPair测试程序 | |||
| ├── reduce_sum_algorithm.maca # 1. ReduceSum测试程序 | |||
| ├── sort_pair_algorithm.maca # 2. SortPair测试程序 | |||
| ├── topk_pair_algorithm.maca # 3. TopkPair测试程序 | |||
| ├── utils/ # 工具文件 | |||
| │ ├── test_utils.h # 测试工具和CPU参考实现 | |||
| │ ├── yaml_reporter.h # YAML性能报告生成器 | |||
| │ └── performance_utils.h # 性能测试工具 | |||
| ├── final_results/reduce_sum_results.yaml #ReduceSum性能数据 | |||
| ├── final_results/sort_pair_results.yaml #SortPair性能数据 | |||
| └── final_results/topk_pair_results.yaml #TopkPair性能数据 | |||
| ├── reduce_sum_results.yaml #ReduceSum性能数据 | |||
| ├── sort_pair_results.yaml #SortPair性能数据 | |||
| └── topk_pair_results.yaml #TopkPair性能数据 | |||
| ``` | |||
| ## 🔧 开发工具 | |||
| @@ -134,7 +83,7 @@ mxcc -O3 -std=c++17 --extended-lambda -Isrc | |||
| |--------|--------|------| | |||
| | `COMPILER` | `mxcc` | MACA编译器(mxcc)路径 | | |||
| | `COMPILER_FLAGS` | `-O3 -std=c++17 --extended-lambda` | 编译标志 | | |||
| | `INCLUDE_DIR` | `src` | 头文件目录 | | |||
| | `HEADER_DIR` | `utils` | 头文件目录 | | |||
| | `BUILD_DIR` | `build` | 构建输出目录 | | |||
| ### 调试模式 | |||
| @@ -1,11 +1,11 @@ | |||
| # 题目: | |||
| # 样例赛题说明 | |||
| ## GPU高性能并行计算算法优化 | |||
| 要求参赛者通过一个或多个global kernel 函数(允许配套 device 辅助函数),实现高性能算法。 | |||
| 在正确性、稳定性前提下,比拼算法性能。 | |||
| # 1. ReduceSum算法优化 | |||
| ```cpp | |||
| template <typename InputT = float, typename OutputT = float> | |||
| @@ -23,14 +23,12 @@ public: | |||
| * 系统将测试评估元素数量为 1M、128M、512M、1G 时的算法性能 | |||
| * 假定输入d\_in数据量为num\_items | |||
| 注意事项 | |||
| * 累计误差相对于 CPU double 精度基准(golden)结果不大于 0.5% | |||
| * 注意针对NAN和INF等异常值的处理 | |||
| 加分项 | |||
| * 使用tensor core计算reduce | |||
| @@ -62,14 +60,11 @@ public: | |||
| * 需要校验结果正确性 | |||
| * 结果必须稳定排序 | |||
| 加分项 | |||
| * 支持其他不同数据类型的排序,如half、double、int32_t等 | |||
| * 覆盖更全面的数据范围,提供良好稳定的性能表现 | |||
| # 3. Topk Pair算法优化 | |||
| ```cpp | |||
| template <typename KeyType, typename ValueType> | |||
| @@ -95,7 +90,6 @@ public: | |||
| * 结果必须稳定排序 | |||
| 加分项 | |||
| * 支持其他不同数据类型的键值对,实现类型通用算法 | |||
| @@ -36,11 +36,11 @@ COMPILER=${COMPILER:-mxcc} | |||
| COMPILER_FLAGS=${COMPILER_FLAGS:-"-O3 -std=c++17 --extended-lambda -DRUN_FULL_TEST"} | |||
| # ***** 这里是关键修改点1:头文件目录 ***** | |||
| # 现在头文件在 includes/ 目录下 | |||
| # 现在头文件在 utils/ 目录下 | |||
| HEADER_DIR=${HEADER_DIR:-utils} | |||
| # ***** 这里是关键修改点2:源文件目录 ***** | |||
| # 现在源文件在 algorithms/ 目录下 | |||
| # 现在源文件在 ./ 目录下 | |||
| SOURCE_CODE_DIR=${SOURCE_CODE_DIR:-} | |||
| BUILD_DIR=${BUILD_DIR:-build} | |||