[WIP] 重构样板赛题

2 months ago · 79e4fd6ab1
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,6 @@
 .DS_Store
 *.bak
 *.pyc
 *.o
 */build/
 cp_template/*.yaml
--- a/README.md
+++ b/README.md
@@ -32,7 +32,52 @@

 ---

 ## 📥 如何参与提交？
 ## 🚀 快速上手

 本竞赛旨在评估参赛者在GPU并行计算领域的算法优化能力。为了让参赛者快速进入比赛状态，参赛者可选择实现以下三个核心算法的高性能版本：
 - **ReduceSum**: 高精度归约求和
 - **SortPair**: 键值对稳定排序
 - **TopkPair**: 键值对TopK选择

 ### 📥 如何参与提交

 ### 编译和测试

 #### 1. 全量编译和运行
 ```bash
 # 编译并运行所有算法测试（默认行为）
 ./run.sh

 # 仅编译所有算法，不运行测试
 ./run.sh --build-only

 # 编译并运行单个算法测试
 ./run.sh --run_reduce   # ReduceSum算法
 ./run.sh --run_sort     # SortPair算法
 ./run.sh --run_topk     # TopkPair算法
 ```

 #### 2. 单独编译和运行
 ```bash
 # 编译并运行ReduceSum算法（默认行为）
 ./run_reduce_sum.sh

 # 仅编译ReduceSum算法，不运行测试
 ./run_reduce_sum.sh --build-only

 # 编译并运行SortPair正确性测试
 ./run_sort_pair.sh --run correctness

 # 编译并运行TopkPair性能测试
 ./run_topk_pair.sh --run performance
 ```

 #### 3. 手动运行测试
 ```bash
 ./build/test_reducesum [correctness|performance|all]
 ./build/test_sortpair [correctness|performance|all]
 ./build/test_topkpair [correctness|performance|all]
 ```

 ### ✅ 参赛要求：
 - 提交内容必须可以在沐曦自研 GPU **曦云 C500** 上运行。
@@ -72,7 +117,7 @@

 ## 🏅 排名规则

 - 比赛周期：2 个月  
 - 比赛周期：2 个月
 - 排名按累计得分排序，取前 12 名！

 若得分相同：
--- a/S1/ICTN0N/build/test_reducesum
+++ b/S1/ICTN0N/build/test_reducesum
--- a/S1/ICTN0N/build/test_sortpair
+++ b/S1/ICTN0N/build/test_sortpair
--- a/S1/ICTN0N/build/test_topkpair
+++ b/S1/ICTN0N/build/test_topkpair
--- a/S1/ICTN0N/reduce_sum_performance.yaml
+++ b/S1/ICTN0N/reduce_sum_performance.yaml
@@ -0,0 +1,26 @@
 # ReduceSum算法性能测试结果
 # 生成时间: 2025-09-03 22:34:18

 algorithm: "ReduceSum"
 data_types:
  input: "float"
  output: "float"
 formulas:
  throughput: "elements / time(s) / 1e9 (G/s)"
 performance_data:
  - data_size: 1000000
    time_ms: 0.048717
    throughput_gps: 20.526799
    data_type: "float"
  - data_size: 134217728
    time_ms: 0.402560
    throughput_gps: 333.410496
    data_type: "float"
  - data_size: 536870912
    time_ms: 1.346586
    throughput_gps: 398.690510
    data_type: "float"
  - data_size: 1073741824
    time_ms: 2.639513
    throughput_gps: 406.795353
    data_type: "float"
--- a/S1/ICTN0N/sort_pair_performance.yaml
+++ b/S1/ICTN0N/sort_pair_performance.yaml
@@ -0,0 +1,46 @@
 # SortPair算法性能测试结果
 # 生成时间: 2025-09-03 22:37:18

 algorithm: "SortPair"
 data_types:
  key_type: "float"
  value_type: "uint32_t"
 formulas:
  throughput: "elements / time(s) / 1e9 (G/s)"
 performance_data:
  - data_size: 1000000
    ascending:
      time_ms: 0.351488
      throughput_gps: 2.845047
    descending:
      time_ms: 0.343270
      throughput_gps: 2.913155
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 134217728
    ascending:
      time_ms: 22.273815
      throughput_gps: 6.025808
    descending:
      time_ms: 22.494003
      throughput_gps: 5.966823
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 536870912
    ascending:
      time_ms: 88.856277
      throughput_gps: 6.042014
    descending:
      time_ms: 89.913918
      throughput_gps: 5.970943
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1073741824
    ascending:
      time_ms: 181.409576
      throughput_gps: 5.918882
    descending:
      time_ms: 183.428955
      throughput_gps: 5.853720
    key_type: "float"
    value_type: "uint32_t"
--- a/S1/ICTN0N/topk_pair_performance.yaml
+++ b/S1/ICTN0N/topk_pair_performance.yaml
@@ -0,0 +1,210 @@
 # TopkPair算法性能测试结果
 # 生成时间: 2025-09-03 22:40:54

 algorithm: "TopkPair"
 data_types:
  key_type: "float"
  value_type: "uint32_t"
 formulas:
  throughput: "elements / time(s) / 1e9 (G/s)"
 performance_data:
  - data_size: 1000000
    k_value: 32
    ascending:
      time_ms: 0.402509
      throughput_gps: 2.484418
    descending:
      time_ms: 0.416307
      throughput_gps: 2.402072
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1000000
    k_value: 50
    ascending:
      time_ms: 0.404787
      throughput_gps: 2.470434
    descending:
      time_ms: 0.414669
      throughput_gps: 2.411563
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1000000
    k_value: 100
    ascending:
      time_ms: 0.398336
      throughput_gps: 2.510443
    descending:
      time_ms: 0.408320
      throughput_gps: 2.449060
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1000000
    k_value: 256
    ascending:
      time_ms: 0.410752
      throughput_gps: 2.434559
    descending:
      time_ms: 0.403379
      throughput_gps: 2.479057
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1000000
    k_value: 1024
    ascending:
      time_ms: 0.391091
      throughput_gps: 2.556949
    descending:
      time_ms: 0.391142
      throughput_gps: 2.556613
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 134217728
    k_value: 32
    ascending:
      time_ms: 22.394062
      throughput_gps: 5.993452
    descending:
      time_ms: 22.263729
      throughput_gps: 6.028538
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 134217728
    k_value: 50
    ascending:
      time_ms: 22.379187
      throughput_gps: 5.997435
    descending:
      time_ms: 22.228352
      throughput_gps: 6.038132
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 134217728
    k_value: 100
    ascending:
      time_ms: 22.436581
      throughput_gps: 5.982094
    descending:
      time_ms: 22.229326
      throughput_gps: 6.037868
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 134217728
    k_value: 256
    ascending:
      time_ms: 22.463232
      throughput_gps: 5.974996
    descending:
      time_ms: 22.319946
      throughput_gps: 6.013354
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 134217728
    k_value: 1024
    ascending:
      time_ms: 22.468454
      throughput_gps: 5.973608
    descending:
      time_ms: 22.335976
      throughput_gps: 6.009038
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 536870912
    k_value: 32
    ascending:
      time_ms: 89.437294
      throughput_gps: 6.002763
    descending:
      time_ms: 88.605972
      throughput_gps: 6.059083
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 536870912
    k_value: 50
    ascending:
      time_ms: 89.460587
      throughput_gps: 6.001200
    descending:
      time_ms: 88.546509
      throughput_gps: 6.063152
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 536870912
    k_value: 100
    ascending:
      time_ms: 89.203011
      throughput_gps: 6.018529
    descending:
      time_ms: 88.809097
      throughput_gps: 6.045224
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 536870912
    k_value: 256
    ascending:
      time_ms: 89.500465
      throughput_gps: 5.998526
    descending:
      time_ms: 88.743912
      throughput_gps: 6.049665
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 536870912
    k_value: 1024
    ascending:
      time_ms: 89.405357
      throughput_gps: 6.004908
    descending:
      time_ms: 88.446083
      throughput_gps: 6.070036
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1073741824
    k_value: 32
    ascending:
      time_ms: 182.233307
      throughput_gps: 5.892127
    descending:
      time_ms: 181.076950
      throughput_gps: 5.929754
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1073741824
    k_value: 50
    ascending:
      time_ms: 182.273239
      throughput_gps: 5.890836
    descending:
      time_ms: 180.944550
      throughput_gps: 5.934093
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1073741824
    k_value: 100
    ascending:
      time_ms: 182.374191
      throughput_gps: 5.887576
    descending:
      time_ms: 181.277100
      throughput_gps: 5.923207
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1073741824
    k_value: 256
    ascending:
      time_ms: 182.349457
      throughput_gps: 5.888374
    descending:
      time_ms: 181.248199
      throughput_gps: 5.924152
    key_type: "float"
    value_type: "uint32_t"
  - data_size: 1073741824
    k_value: 1024
    ascending:
      time_ms: 182.378326
      throughput_gps: 5.887442
    descending:
      time_ms: 181.025803
      throughput_gps: 5.931430
    key_type: "float"
    value_type: "uint32_t"
--- a/cp_run_guide.md
+++ b/cp_run_guide.md
@@ -1,59 +1,12 @@
 # GPU 高性能并行计算算法优化竞赛

 ## 🎯 竞赛概述

 本竞赛旨在评估参赛者在GPU并行计算领域的算法优化能力。参赛者可选择实现三个核心算法的高性能版本：
 - **ReduceSum**: 高精度归约求和
 - **SortPair**: 键值对稳定排序
 - **TopkPair**: 键值对TopK选择

 ## 🚀 快速开始

 ### 编译和测试

 #### 1. 全量编译和运行
 ```bash
 # 编译并运行所有算法测试（默认行为）
 ./build_and_run.sh

 # 仅编译所有算法，不运行测试
 ./build_and_run.sh --build-only

 # 编译并运行单个算法测试
 ./build_and_run.sh --run_reduce   # ReduceSum算法
 ./build_and_run.sh --run_sort     # SortPair算法
 ./build_and_run.sh --run_topk     # TopkPair算法
 ```

 #### 2. 单独编译和运行
 ```bash
 # 编译并运行ReduceSum算法（默认行为）
 ./build_and_run_reduce_sum.sh

 # 仅编译ReduceSum算法，不运行测试
 ./build_and_run_reduce_sum.sh --build-only

 # 编译并运行SortPair正确性测试
 ./build_and_run_sort_pair.sh --run correctness

 # 编译并运行TopkPair性能测试
 ./build_and_run_topk_pair.sh --run performance
 ```

 #### 3. 手动运行测试
 ```bash
 ./build/test_reducesum [correctness|performance|all]
 ./build/test_sortpair [correctness|performance|all]
 ./build/test_topkpair [correctness|performance|all]
 ```

 ## 📝 参赛指南

 ### 实现位置
 参赛者需要在以下文件中替换Thrust实现：
 - `src/reduce_sum_algorithm.maca` - 替换Thrust归约求和
 - `src/sort_pair_algorithm.maca` - 替换Thrust稳定排序
 - `src/topk_pair_algorithm.maca` - 替换Thrust TopK选择
 - `reduce_sum_algorithm.maca` - 替换Thrust归约求和
 - `sort_pair_algorithm.maca` - 替换Thrust稳定排序
 - `topk_pair_algorithm.maca` - 替换Thrust TopK选择

 ### 算法要求
 见competition_parallel_algorithms.md
@@ -92,25 +45,21 @@
 - 各数据规模的详细性能数据
 - 升序/降序分别统计（适用时）

 ## 📁 项目结构
 ## 📁 提交内容结构

 ```
 ├── build_and_run.sh                # 统一编译和运行脚本（默认编译+运行所有算法）
 ├── build_common.sh                  # 公共编译配置和函数
 ├── build_and_run_reduce_sum.sh     # ReduceSum独立编译和运行脚本
 ├── build_and_run_sort_pair.sh      # SortPair独立编译和运行脚本
 ├── build_and_run_topk_pair.sh      # TopkPair独立编译和运行脚本
 ├── run.sh                # 统一编译和运行脚本（默认编译+运行所有算法）
 ├── competition_parallel_algorithms.md  # 详细题目说明
 ├── src/                            # 算法实现和工具文件
 │   ├── reduce_sum_algorithm.maca     # 1. ReduceSum测试程序
 │   ├── sort_pair_algorithm.maca      # 2. SortPair测试程序
 │   ├── topk_pair_algorithm.maca      # 3. TopkPair测试程序
 ├── reduce_sum_algorithm.maca     # 1. ReduceSum测试程序
 ├── sort_pair_algorithm.maca      # 2. SortPair测试程序
 ├── topk_pair_algorithm.maca      # 3. TopkPair测试程序
 ├── utils/                        # 工具文件
 │   ├── test_utils.h               # 测试工具和CPU参考实现
 │   ├── yaml_reporter.h            # YAML性能报告生成器
 │   └── performance_utils.h        # 性能测试工具
 ├── final_results/reduce_sum_results.yaml  #ReduceSum性能数据
 ├── final_results/sort_pair_results.yaml   #SortPair性能数据
 └── final_results/topk_pair_results.yaml   #TopkPair性能数据         
 ├── reduce_sum_results.yaml        #ReduceSum性能数据
 ├── sort_pair_results.yaml         #SortPair性能数据
 └── topk_pair_results.yaml         #TopkPair性能数据         
 ```

 ## 🔧 开发工具
@@ -134,7 +83,7 @@ mxcc -O3 -std=c++17 --extended-lambda -Isrc
 |--------|--------|------|
 | `COMPILER` | `mxcc` | MACA编译器（mxcc）路径 |
 | `COMPILER_FLAGS` | `-O3 -std=c++17 --extended-lambda` | 编译标志 |
 | `INCLUDE_DIR` | `src` | 头文件目录 |
 | `HEADER_DIR` | `utils` | 头文件目录 |
 | `BUILD_DIR` | `build` | 构建输出目录 |

 ### 调试模式
--- a/cp_template/competition_parallel_algorithms.md
+++ b/cp_template/competition_parallel_algorithms.md
@@ -1,11 +1,11 @@
 # 题目：
 # 样例赛题说明

 ## GPU高性能并行计算算法优化

 要求参赛者通过一个或多个global kernel 函数（允许配套 device 辅助函数），实现高性能算法。

 在正确性、稳定性前提下，比拼算法性能。


 # 1. ReduceSum算法优化
 ```cpp
 template <typename InputT = float, typename OutputT = float>
@@ -23,14 +23,12 @@ public:
 * 系统将在 1M、128M、512M、1G 个元素的数据规模下测试并评估算法性能
 * 假定输入d\_in数据量为num\_items


 注意事项

 * 累计误差相对于 CPU double 精度基准（golden）不大于 0.5%
 * 注意针对NAN和INF等异常值的处理



 加分项

 * 使用tensor core计算reduce
@@ -62,14 +60,11 @@ public:
 * 需要校验结果正确性
 * 结果必须稳定排序


 加分项

 * 支持其他不同数据类型的排序，如half、double、int32_t等
 * 覆盖更全面的数据范围，提供良好稳定的性能表现



 # 3. Topk Pair算法优化
 ```cpp
 template <typename KeyType, typename ValueType>
@@ -95,7 +90,6 @@ public:

 * 结果必须稳定排序


 加分项

 * 支持其他不同数据类型的键值对，实现类型通用算法
--- a/cp_template/run.sh
+++ b/cp_template/run.sh
@@ -36,11 +36,11 @@ COMPILER=${COMPILER:-mxcc}
 COMPILER_FLAGS=${COMPILER_FLAGS:-"-O3 -std=c++17 --extended-lambda -DRUN_FULL_TEST"} 

 # ***** 这里是关键修改点1：头文件目录 *****
 # 现在头文件在 includes/ 目录下
 # 现在头文件在 utils/ 目录下
 HEADER_DIR=${HEADER_DIR:-utils} 

 # ***** 这里是关键修改点2：源文件目录 *****
 # 现在源文件在 algorithms/ 目录下
 # 现在源文件在 ./ 目录下
 SOURCE_CODE_DIR=${SOURCE_CODE_DIR:-} 

 BUILD_DIR=${BUILD_DIR:-build}
--- a/cp_template/utils/performance_utils.h
+++ b/cp_template/utils/performance_utils.h
--- a/cp_template/utils/test_utils.h
+++ b/cp_template/utils/test_utils.h
--- a/cp_template/utils/yaml_reporter.h
+++ b/cp_template/utils/yaml_reporter.h