| @@ -0,0 +1,277 @@ | |||
| #include "test_utils.h" | |||
| #include "performance_utils.h" | |||
| #include "yaml_reporter.h" | |||
| #include <iostream> | |||
| #include <vector> | |||
| #include <iomanip> | |||
| // ============================================================================ | |||
| // 实现标记宏 - 参赛者修改实现时请将此宏设为0 | |||
| // ============================================================================ | |||
| #ifndef USE_DEFAULT_REF_IMPL | |||
| #define USE_DEFAULT_REF_IMPL 1 // 1=默认实现, 0=参赛者自定义实现 | |||
| #endif | |||
| #if USE_DEFAULT_REF_IMPL | |||
| #include <thrust/reduce.h> | |||
| #include <thrust/device_vector.h> | |||
| #include <thrust/execution_policy.h> | |||
| #include <thrust/functional.h> | |||
| #endif | |||
| // 误差容忍度 | |||
| constexpr double REDUCE_ERROR_TOLERANCE = 0.005; // 0.5% | |||
| // ============================================================================ | |||
| // ReduceSum算法实现接口 | |||
| // 参赛者需要替换Thrust实现为自己的高性能kernel | |||
| // ============================================================================ | |||
| template <typename InputT = float, typename OutputT = float> | |||
| class ReduceSumAlgorithm { | |||
| public: | |||
| // 主要接口函数 - 参赛者需要实现这个函数 | |||
| void reduce(const InputT* d_in, OutputT* d_out, int num_items, OutputT init_value) { | |||
| #if !USE_DEFAULT_REF_IMPL | |||
| // ======================================== | |||
| // 参赛者自定义实现区域 | |||
| // ======================================== | |||
| // TODO: 参赛者在此实现自己的高性能归约算法 | |||
| // 示例:参赛者可以调用1个或多个自定义kernel | |||
| // blockReduceKernel<<<grid, block>>>(d_in, temp_results, num_items, init_value); | |||
| // finalReduceKernel<<<1, block>>>(temp_results, d_out, grid.x); | |||
| #else | |||
| // ======================================== | |||
| // 默认基准实现 | |||
| // ======================================== | |||
| auto input_ptr = thrust::device_pointer_cast(d_in); | |||
| auto output_ptr = thrust::device_pointer_cast(d_out); | |||
| // 直接使用thrust::reduce进行归约 | |||
| *output_ptr = thrust::reduce( | |||
| thrust::device, | |||
| input_ptr, | |||
| input_ptr + num_items, | |||
| static_cast<OutputT>(init_value) | |||
| ); | |||
| #endif | |||
| } | |||
| // 获取当前实现状态 | |||
| static const char* getImplementationStatus() { | |||
| #if USE_DEFAULT_REF_IMPL | |||
| return "DEFAULT_REF_IMPL"; | |||
| #else | |||
| return "CUSTOM_IMPL"; | |||
| #endif | |||
| } | |||
| private: | |||
| // 参赛者可以在这里添加辅助函数和成员变量 | |||
| // 例如:中间结果缓冲区、多阶段归约等 | |||
| }; | |||
| // ============================================================================ | |||
| // 测试和性能评估 | |||
| // ============================================================================ | |||
| bool testCorrectness() { | |||
| std::cout << "ReduceSum 正确性测试..." << std::endl; | |||
| TestDataGenerator generator; | |||
| ReduceSumAlgorithm<float, float> algorithm; | |||
| bool allPassed = true; | |||
| // 测试不同数据规模 | |||
| for (int i = 0; i < NUM_TEST_SIZES && i < 2; i++) { // 限制测试规模 | |||
| int size = std::min(TEST_SIZES[i], 10000); | |||
| std::cout << " 测试规模: " << size << std::endl; | |||
| // 测试普通数据 | |||
| { | |||
| auto data = generator.generateRandomFloats(size, -10.0f, 10.0f); | |||
| float init_value = 1.0f; | |||
| // CPU参考计算 | |||
| double cpu_result = cpuReduceSum(data, static_cast<double>(init_value)); | |||
| // GPU计算 | |||
| float *d_in; | |||
| float *d_out; | |||
| MACA_CHECK(mcMalloc(&d_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_out, sizeof(float))); | |||
| MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| algorithm.reduce(d_in, d_out, size, init_value); | |||
| float gpu_result; | |||
| MACA_CHECK(mcMemcpy(&gpu_result, d_out, sizeof(float), mcMemcpyDeviceToHost)); | |||
| // 验证误差 | |||
| double relative_error = std::abs(gpu_result - cpu_result) / std::abs(cpu_result); | |||
| if (relative_error > REDUCE_ERROR_TOLERANCE) { | |||
| std::cout << " 失败: 误差过大 " << relative_error << std::endl; | |||
| allPassed = false; | |||
| } else { | |||
| std::cout << " 通过 (误差: " << relative_error << ")" << std::endl; | |||
| } | |||
| mcFree(d_in); | |||
| mcFree(d_out); | |||
| } | |||
| // 测试特殊值 (NaN, Inf) | |||
| if (size > 100) { | |||
| std::cout << " 测试特殊值..." << std::endl; | |||
| auto data = generator.generateSpecialFloats(size); | |||
| float init_value = 0.0f; | |||
| double cpu_result = cpuReduceSum(data, static_cast<double>(init_value)); | |||
| float *d_in; | |||
| float *d_out; | |||
| MACA_CHECK(mcMalloc(&d_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_out, sizeof(float))); | |||
| MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| algorithm.reduce(d_in, d_out, size, init_value); | |||
| float gpu_result; | |||
| MACA_CHECK(mcMemcpy(&gpu_result, d_out, sizeof(float), mcMemcpyDeviceToHost)); | |||
| // 对于包含特殊值的情况,检查是否正确处理 | |||
| if (std::isfinite(cpu_result) && std::isfinite(gpu_result)) { | |||
| double relative_error = std::abs(gpu_result - cpu_result) / std::abs(cpu_result); | |||
| if (relative_error > REDUCE_ERROR_TOLERANCE) { | |||
| std::cout << " 失败: 特殊值处理错误" << std::endl; | |||
| allPassed = false; | |||
| } else { | |||
| std::cout << " 通过 (特殊值处理)" << std::endl; | |||
| } | |||
| } else { | |||
| std::cout << " 通过 (特殊值结果)" << std::endl; | |||
| } | |||
| mcFree(d_in); | |||
| mcFree(d_out); | |||
| } | |||
| } | |||
| return allPassed; | |||
| } | |||
| void benchmarkPerformance() { | |||
| PerformanceDisplay::printReduceSumHeader(); | |||
| TestDataGenerator generator; | |||
| PerformanceMeter meter; | |||
| ReduceSumAlgorithm<float, float> algorithm; | |||
| const int WARMUP_ITERATIONS = 5; | |||
| const int BENCHMARK_ITERATIONS = 10; | |||
| // 用于YAML报告的数据收集 | |||
| std::vector<std::map<std::string, std::string>> perf_data; | |||
| for (int i = 0; i < NUM_TEST_SIZES; i++) { | |||
| int size = TEST_SIZES[i]; | |||
| // 生成测试数据 | |||
| auto data = generator.generateRandomFloats(size); | |||
| float init_value = 0.0f; | |||
| // 分配GPU内存 | |||
| float *d_in; | |||
| float *d_out; | |||
| MACA_CHECK(mcMalloc(&d_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_out, sizeof(float))); | |||
| MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| // Warmup阶段 | |||
| for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) { | |||
| algorithm.reduce(d_in, d_out, size, init_value); | |||
| } | |||
| // 正式测试阶段 | |||
| float total_time = 0; | |||
| for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) { | |||
| meter.startTiming(); | |||
| algorithm.reduce(d_in, d_out, size, init_value); | |||
| total_time += meter.stopTiming(); | |||
| } | |||
| float avg_time = total_time / BENCHMARK_ITERATIONS; | |||
| // 计算性能指标 | |||
| auto metrics = PerformanceCalculator::calculateReduceSum(size, avg_time); | |||
| // 显示性能数据 | |||
| PerformanceDisplay::printReduceSumData(size, avg_time, metrics); | |||
| // 收集YAML报告数据 | |||
| auto entry = YAMLPerformanceReporter::createEntry(); | |||
| entry["data_size"] = std::to_string(size); | |||
| entry["time_ms"] = std::to_string(avg_time); | |||
| entry["throughput_gps"] = std::to_string(metrics.throughput_gps); | |||
| entry["data_type"] = "float"; | |||
| perf_data.push_back(entry); | |||
| mcFree(d_in); | |||
| mcFree(d_out); | |||
| } | |||
| // 生成YAML性能报告 | |||
| YAMLPerformanceReporter::generateReduceSumYAML(perf_data, "reduce_sum_performance.yaml"); | |||
| PerformanceDisplay::printSavedMessage("reduce_sum_performance.yaml"); | |||
| } | |||
| // ============================================================================ | |||
| // 主函数 | |||
| // ============================================================================ | |||
| int main(int argc, char* argv[]) { | |||
| std::cout << "=== ReduceSum 算法测试 ===" << std::endl; | |||
| // 检查参数 | |||
| std::string mode = "all"; | |||
| if (argc > 1) { | |||
| mode = argv[1]; | |||
| } | |||
| bool correctness_passed = true; | |||
| bool performance_completed = true; | |||
| try { | |||
| if (mode == "correctness" || mode == "all") { | |||
| correctness_passed = testCorrectness(); | |||
| } | |||
| if (mode == "performance" || mode == "all") { | |||
| if (correctness_passed || mode == "performance") { | |||
| benchmarkPerformance(); | |||
| } else { | |||
| std::cout << "跳过性能测试,因为正确性测试未通过" << std::endl; | |||
| performance_completed = false; | |||
| } | |||
| } | |||
| std::cout << "\n=== 测试完成 ===" << std::endl; | |||
| std::cout << "实现状态: " << ReduceSumAlgorithm<float, float>::getImplementationStatus() << std::endl; | |||
| if (mode == "all") { | |||
| std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl; | |||
| std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl; | |||
| } | |||
| return correctness_passed ? 0 : 1; | |||
| } catch (const std::exception& e) { | |||
| std::cerr << "测试出错: " << e.what() << std::endl; | |||
| return 1; | |||
| } | |||
| } | |||
| @@ -0,0 +1,274 @@ | |||
| #!/bin/bash | |||
| # GPU高性能并行计算算法优化竞赛 - 统一编译和运行脚本 | |||
| # 整合了所有算法的编译、运行和公共配置 | |||
| # ============================================================================ | |||
| # 公共配置和工具函数 | |||
| # ============================================================================ | |||
# ANSI colour escape sequences. They are stored as literal backslash sequences
# and only turned into control characters by the print helpers.
RED="\033[0;31m"
GREEN="\033[0;32m"
BLUE="\033[0;34m"
YELLOW="\033[0;33m"
NC="\033[0m" # reset / no colour
| # 打印函数 | |||
# Print an informational message ($1) with a blue [INFO] tag.
print_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print a success message ($1) with a green [SUCCESS] tag.
print_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print an error message ($1) with a red [ERROR] tag (on stdout, like the other helpers).
print_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Print a warning message ($1) with a yellow [WARNING] tag.
print_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# Build configuration. Every value can be overridden through an environment
# variable of the same name; an unset or empty variable gets the default below.
: "${COMPILER:=mxcc}"
: "${COMPILER_FLAGS:=-O3 -std=c++17 --extended-lambda -DRUN_FULL_TEST}"
# Directory added to the include path with -I (default: utils).
: "${HEADER_DIR:=utils}"
# Directory holding the algorithm sources (default: empty).
: "${SOURCE_CODE_DIR:=}"
# Directory receiving the compiled test executables (default: build).
: "${BUILD_DIR:=build}"
# Compile the test driver of one algorithm into $BUILD_DIR.
# Arguments:
#   $1 - algorithm name; lower-cased to form the executable name test_<name>
#   $2 - source file name (without directory), e.g. "reduce_sum_algorithm.maca";
#        looked up under $SOURCE_CODE_DIR when that variable is non-empty,
#        otherwise relative to the current directory
# Returns:
#   0 on success, 1 if the source file is missing or the compiler fails.
compile_algorithm() {
    local algo_name="$1"
    local source_file_name="$2"
    local target_file="$BUILD_DIR/test_${algo_name,,}" # lower-case name
    # Honour SOURCE_CODE_DIR (previously configured and displayed but ignored).
    local source_path="${SOURCE_CODE_DIR:+${SOURCE_CODE_DIR%/}/}$source_file_name"

    print_info "编译 $algo_name 算法..."

    # Fail early with a clear message instead of a confusing compiler error.
    if [ -z "$source_file_name" ] || [ ! -f "$source_path" ]; then
        print_error "$algo_name 源文件不存在: $source_path"
        return 1
    fi

    # Create the build directory.
    mkdir -p "$BUILD_DIR"

    # Build the command as an array so paths containing spaces survive.
    # COMPILER_FLAGS is deliberately unquoted: it holds several flags that
    # must be split into separate words.
    # shellcheck disable=SC2206
    local -a compile_cmd=("$COMPILER" $COMPILER_FLAGS "-I$HEADER_DIR" "$source_path" -o "$target_file")
    print_info "执行: ${compile_cmd[*]}"

    if "${compile_cmd[@]}"; then
        # Only prefix "./" for relative targets; an absolute path is runnable as-is.
        local run_hint="./$target_file"
        case "$target_file" in
            /*) run_hint="$target_file" ;;
        esac
        print_success "$algo_name 编译完成!"
        echo ""
        echo "运行测试:"
        echo " $run_hint [correctness|performance|all]"
        return 0
    else
        print_error "$algo_name 编译失败!"
        return 1
    fi
}
# Print the effective build configuration, followed by a blank line.
show_build_config() {
    print_info "编译配置:"
    printf '%s\n' \
        " COMPILER: $COMPILER" \
        " COMPILER_FLAGS: $COMPILER_FLAGS" \
        " HEADER_DIR: $HEADER_DIR" \
        " SOURCE_CODE_DIR: $SOURCE_CODE_DIR" \
        " BUILD_DIR: $BUILD_DIR" \
        ""
}
# Run the compiled test executable of one algorithm.
# Arguments:
#   $1 - algorithm name (executable is $BUILD_DIR/test_<lower-cased name>)
#   $2 - test mode passed to the executable (default: all)
# Returns:
#   the executable's exit status, or 1 when the executable does not exist.
run_single_test() {
    local algo_name="$1"
    local test_mode="${2:-all}"
    local test_file="$BUILD_DIR/test_${algo_name,,}"

    if [ ! -f "$test_file" ]; then
        print_error "$algo_name 测试程序不存在: $test_file"
        return 1
    fi

    print_info "运行 $algo_name 测试 (模式: $test_mode)..."
    # $test_file always contains a '/', so it is executed as a path and never
    # looked up in PATH. Prefixing "./" would break an absolute BUILD_DIR.
    "$test_file" "$test_mode"
}
| # ============================================================================ | |||
| # 主脚本逻辑 | |||
| # ============================================================================ | |||
# Print the usage text covering every supported command-line option.
show_help() {
    cat <<EOF
GPU算法竞赛统一编译和运行脚本
用法: $0 [选项]

选项:
 --help 显示帮助信息
 --build-only 仅编译所有算法,不运行测试
 --run_reduce [MODE] 编译并运行ReduceSum算法测试 (MODE: correctness|performance|all, 默认all)
 --run_sort [MODE] 编译并运行SortPair算法测试 (MODE: correctness|performance|all, 默认all)
 --run_topk [MODE] 编译并运行TopkPair算法测试 (MODE: correctness|performance|all, 默认all)

示例:
 $0 # 编译并运行所有测试(默认行为)
 $0 --build-only # 仅编译所有算法
 $0 --run_sort performance # 编译并运行SortPair性能测试

EOF
}
# Command-line parsing.
RUN_MODE="run_all"           # default: build and run every test
ALGO_TO_RUN=""               # algorithm selected by a --run_* option
SINGLE_ALGO_TEST_MODE="all"  # test mode for the selected algorithm

while (( $# > 0 )); do
    case "$1" in
        --help)
            show_help
            exit 0
            ;;
        --build-only)
            RUN_MODE="build_only"
            shift
            ;;
        --run_reduce|--run_sort|--run_topk)
            RUN_MODE="run_single"
            case "$1" in
                --run_reduce) ALGO_TO_RUN="ReduceSum" ;;
                --run_sort)   ALGO_TO_RUN="SortPair" ;;
                --run_topk)   ALGO_TO_RUN="TopkPair" ;;
            esac
            shift
            # An optional MODE follows unless the next word is another option.
            if [[ -n "$1" && "$1" != --* ]]; then
                SINGLE_ALGO_TEST_MODE="$1"
                shift
            fi
            ;;
        *)
            print_error "未知选项: $1"
            show_help
            exit 1
            ;;
    esac
done
# Announce what is about to happen. The single-algorithm mode gets its own
# message instead of claiming that all algorithms are processed.
case "$RUN_MODE" in
    build_only) print_info "开始编译所有算法..." ;;
    run_single) print_info "开始编译并运行 ${ALGO_TO_RUN} 算法..." ;;
    *)          print_info "开始编译并运行所有算法..." ;;
esac
print_info "工作目录: $(pwd)"
print_info "编译时间: $(date '+%Y-%m-%d %H:%M:%S')"
show_build_config

# Clean the build directory. BUILD_DIR comes from the environment, so never
# hand the filesystem root or the current/parent directory to `rm -rf`;
# in that case the clean step is skipped and the build continues.
case "$BUILD_DIR" in
    /|.|..|./|../)
        print_warning "跳过清理: 不安全的构建目录 '$BUILD_DIR'"
        ;;
    *)
        if [ -d "$BUILD_DIR" ]; then
            print_info "清理现有构建目录: $BUILD_DIR"
            rm -rf -- "$BUILD_DIR"
        fi
        ;;
esac
# Core logic: act according to RUN_MODE.
# Algorithm table shared by every mode (names and sources are index-aligned).
ALGO_NAMES=("ReduceSum" "SortPair" "TopkPair")
ALGO_SOURCES=("reduce_sum_algorithm.maca" "sort_pair_algorithm.maca" "topk_pair_algorithm.maca")

case "$RUN_MODE" in
    "build_only")
        print_info "编译所有算法..."
        for algo_idx in "${!ALGO_NAMES[@]}"; do
            current_algo="${ALGO_NAMES[$algo_idx]}"
            print_info "[$((algo_idx + 1))/${#ALGO_NAMES[@]}] 编译${current_algo}..."
            if ! compile_algorithm "$current_algo" "${ALGO_SOURCES[$algo_idx]}"; then
                print_error "${current_algo}编译失败"
                exit 1
            fi
        done
        print_success "所有算法编译完成!"
        echo ""
        echo "可执行文件:"
        for current_algo in "${ALGO_NAMES[@]}"; do
            echo " $BUILD_DIR/test_${current_algo,,} - ${current_algo}算法测试"
        done
        echo ""
        echo "使用方法:"
        for current_algo in "${ALGO_NAMES[@]}"; do
            echo " ./$BUILD_DIR/test_${current_algo,,} [correctness|performance|all]"
        done
        ;;
    "run_all")
        print_info "编译并运行所有算法测试..."
        # A compile failure aborts immediately. A failing test lets the
        # remaining algorithms run but makes the script exit non-zero, so
        # callers (CI, graders) can detect it.
        failed_tests=0
        for algo_idx in "${!ALGO_NAMES[@]}"; do
            current_algo="${ALGO_NAMES[$algo_idx]}"
            print_info "[$((algo_idx + 1))/${#ALGO_NAMES[@]}] ${current_algo}..."
            if ! compile_algorithm "$current_algo" "${ALGO_SOURCES[$algo_idx]}"; then
                exit 1
            fi
            if ! run_single_test "$current_algo" "all"; then
                failed_tests=$((failed_tests + 1))
            fi
        done
        if [ "$failed_tests" -gt 0 ]; then
            print_error "有 ${failed_tests} 个算法测试未通过"
            exit 1
        fi
        print_success "所有测试完成!"
        ;;
    "run_single")
        print_info "编译并运行 ${ALGO_TO_RUN} 测试 (模式: ${SINGLE_ALGO_TEST_MODE})..."
        # Plain assignment: `local` is only valid inside a function.
        source_file_name=""
        for algo_idx in "${!ALGO_NAMES[@]}"; do
            if [ "${ALGO_NAMES[$algo_idx]}" = "$ALGO_TO_RUN" ]; then
                source_file_name="${ALGO_SOURCES[$algo_idx]}"
            fi
        done
        if ! compile_algorithm "$ALGO_TO_RUN" "$source_file_name"; then
            exit 1
        fi
        # The script's exit status is the test's exit status.
        run_single_test "$ALGO_TO_RUN" "$SINGLE_ALGO_TEST_MODE"
        exit $?
        ;;
esac
| @@ -0,0 +1,275 @@ | |||
| #include "test_utils.h" | |||
| #include "performance_utils.h" | |||
| #include "yaml_reporter.h" | |||
| #include <iostream> | |||
| #include <vector> | |||
| #include <iomanip> | |||
| // ============================================================================ | |||
| // 实现标记宏 - 参赛者修改实现时请将此宏设为0 | |||
| // ============================================================================ | |||
| #ifndef USE_DEFAULT_REF_IMPL | |||
| #define USE_DEFAULT_REF_IMPL 1 // 1=默认实现, 0=参赛者自定义实现 | |||
| #endif | |||
| #if USE_DEFAULT_REF_IMPL | |||
| #include <thrust/sort.h> | |||
| #include <thrust/device_vector.h> | |||
| #include <thrust/execution_policy.h> | |||
| #include <thrust/iterator/zip_iterator.h> | |||
| #include <thrust/tuple.h> | |||
| #endif | |||
| // ============================================================================ | |||
| // SortPair算法实现接口 | |||
| // 参赛者需要替换Thrust实现为自己的高性能kernel | |||
| // ============================================================================ | |||
| template <typename KeyType, typename ValueType> | |||
| class SortPairAlgorithm { | |||
| public: | |||
| // 主要接口函数 - 参赛者需要实现这个函数 | |||
| void sort(const KeyType* d_keys_in, KeyType* d_keys_out, | |||
| const ValueType* d_values_in, ValueType* d_values_out, | |||
| int num_items, bool descending) { | |||
| #if !USE_DEFAULT_REF_IMPL | |||
| // ======================================== | |||
| // 参赛者自定义实现区域 | |||
| // ======================================== | |||
| // TODO: 参赛者在此实现自己的高性能排序算法 | |||
| // 示例:参赛者可以调用1个或多个自定义kernel | |||
| // preprocessKernel<<<grid, block>>>(d_keys_in, d_values_in, num_items); | |||
| // mainSortKernel<<<grid, block>>>(d_keys_out, d_values_out, num_items, descending); | |||
| // postprocessKernel<<<grid, block>>>(d_keys_out, d_values_out, num_items); | |||
| #else | |||
| // ======================================== | |||
| // 默认基准实现 | |||
| // ======================================== | |||
| MACA_CHECK(mcMemcpy(d_keys_out, d_keys_in, num_items * sizeof(KeyType), mcMemcpyDeviceToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_out, d_values_in, num_items * sizeof(ValueType), mcMemcpyDeviceToDevice)); | |||
| auto key_ptr = thrust::device_pointer_cast(d_keys_out); | |||
| auto value_ptr = thrust::device_pointer_cast(d_values_out); | |||
| if (descending) { | |||
| thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::greater<KeyType>()); | |||
| } else { | |||
| thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::less<KeyType>()); | |||
| } | |||
| #endif | |||
| } | |||
| // 获取当前实现状态 | |||
| static const char* getImplementationStatus() { | |||
| #if USE_DEFAULT_REF_IMPL | |||
| return "DEFAULT_REF_IMPL"; | |||
| #else | |||
| return "CUSTOM_IMPL"; | |||
| #endif | |||
| } | |||
| private: | |||
| // 参赛者可以在这里添加辅助函数和成员变量 | |||
| // 例如:临时缓冲区、多个kernel函数、流等 | |||
| }; | |||
| // ============================================================================ | |||
| // 测试和性能评估 | |||
| // ============================================================================ | |||
| bool testCorrectness() { | |||
| std::cout << "SortPair 正确性测试..." << std::endl; | |||
| TestDataGenerator generator; | |||
| SortPairAlgorithm<float, uint32_t> algorithm; | |||
| // 测试小规模数据 | |||
| int size = 10000; | |||
| auto keys = generator.generateRandomFloats(size); | |||
| auto values = generator.generateRandomUint32(size); | |||
| // 分配GPU内存 | |||
| float *d_keys_in, *d_keys_out; | |||
| uint32_t *d_values_in, *d_values_out; | |||
| MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_keys_out, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMalloc(&d_values_out, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); | |||
| // 测试升序和降序 | |||
| bool allPassed = true; | |||
| for (bool descending : {false, true}) { | |||
| std::cout << " " << (descending ? "降序" : "升序") << " 测试..." << std::endl; | |||
| // CPU参考结果 | |||
| auto cpu_keys = keys; | |||
| auto cpu_values = values; | |||
| cpuSortPair(cpu_keys, cpu_values, descending); | |||
| // GPU算法结果 | |||
| algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending); | |||
| // 获取结果 | |||
| std::vector<float> gpu_keys(size); | |||
| std::vector<uint32_t> gpu_values(size); | |||
| MACA_CHECK(mcMemcpy(gpu_keys.data(), d_keys_out, size * sizeof(float), mcMemcpyDeviceToHost)); | |||
| MACA_CHECK(mcMemcpy(gpu_values.data(), d_values_out, size * sizeof(uint32_t), mcMemcpyDeviceToHost)); | |||
| // 验证结果 | |||
| bool keysMatch = compareArrays(cpu_keys, gpu_keys, 1e-5); | |||
| bool valuesMatch = compareArrays(cpu_values, gpu_values); | |||
| if (!keysMatch || !valuesMatch) { | |||
| std::cout << " 失败: 结果不匹配" << std::endl; | |||
| allPassed = false; | |||
| } else { | |||
| std::cout << " 通过" << std::endl; | |||
| } | |||
| } | |||
| // 清理内存 | |||
| mcFree(d_keys_in); | |||
| mcFree(d_keys_out); | |||
| mcFree(d_values_in); | |||
| mcFree(d_values_out); | |||
| return allPassed; | |||
| } | |||
| void benchmarkPerformance() { | |||
| PerformanceDisplay::printSortPairHeader(); | |||
| TestDataGenerator generator; | |||
| PerformanceMeter meter; | |||
| SortPairAlgorithm<float, uint32_t> algorithm; | |||
| const int WARMUP_ITERATIONS = 5; | |||
| const int BENCHMARK_ITERATIONS = 10; | |||
| // 用于YAML报告的数据收集 | |||
| std::vector<std::map<std::string, std::string>> perf_data; | |||
| for (int i = 0; i < NUM_TEST_SIZES; i++) { | |||
| int size = TEST_SIZES[i]; | |||
| // 生成测试数据 | |||
| auto keys = generator.generateRandomFloats(size); | |||
| auto values = generator.generateRandomUint32(size); | |||
| // 分配GPU内存 | |||
| float *d_keys_in, *d_keys_out; | |||
| uint32_t *d_values_in, *d_values_out; | |||
| MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_keys_out, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMalloc(&d_values_out, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); | |||
| float asc_time = 0, desc_time = 0; | |||
| // 测试升序和降序 | |||
| for (bool descending : {false, true}) { | |||
| // Warmup阶段 | |||
| for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) { | |||
| algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending); | |||
| } | |||
| // 正式测试阶段 | |||
| float total_time = 0; | |||
| for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) { | |||
| meter.startTiming(); | |||
| algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending); | |||
| total_time += meter.stopTiming(); | |||
| } | |||
| float avg_time = total_time / BENCHMARK_ITERATIONS; | |||
| if (descending) { | |||
| desc_time = avg_time; | |||
| } else { | |||
| asc_time = avg_time; | |||
| } | |||
| } | |||
| // 计算性能指标 | |||
| auto asc_metrics = PerformanceCalculator::calculateSortPair(size, asc_time); | |||
| auto desc_metrics = PerformanceCalculator::calculateSortPair(size, desc_time); | |||
| // 显示性能数据 | |||
| PerformanceDisplay::printSortPairData(size, asc_time, desc_time, asc_metrics, desc_metrics); | |||
| // 收集YAML报告数据 | |||
| auto entry = YAMLPerformanceReporter::createEntry(); | |||
| entry["data_size"] = std::to_string(size); | |||
| entry["asc_time_ms"] = std::to_string(asc_time); | |||
| entry["desc_time_ms"] = std::to_string(desc_time); | |||
| entry["asc_throughput_gps"] = std::to_string(asc_metrics.throughput_gps); | |||
| entry["desc_throughput_gps"] = std::to_string(desc_metrics.throughput_gps); | |||
| entry["key_type"] = "float"; | |||
| entry["value_type"] = "uint32_t"; | |||
| perf_data.push_back(entry); | |||
| // 清理内存 | |||
| mcFree(d_keys_in); | |||
| mcFree(d_keys_out); | |||
| mcFree(d_values_in); | |||
| mcFree(d_values_out); | |||
| } | |||
| // 生成YAML性能报告 | |||
| YAMLPerformanceReporter::generateSortPairYAML(perf_data, "sort_pair_performance.yaml"); | |||
| PerformanceDisplay::printSavedMessage("sort_pair_performance.yaml"); | |||
| } | |||
| // ============================================================================ | |||
| // 主函数 | |||
| // ============================================================================ | |||
| int main(int argc, char* argv[]) { | |||
| std::cout << "=== SortPair 算法测试 ===" << std::endl; | |||
| // 检查参数 | |||
| std::string mode = "all"; | |||
| if (argc > 1) { | |||
| mode = argv[1]; | |||
| } | |||
| bool correctness_passed = true; | |||
| bool performance_completed = true; | |||
| try { | |||
| if (mode == "correctness" || mode == "all") { | |||
| correctness_passed = testCorrectness(); | |||
| } | |||
| if (mode == "performance" || mode == "all") { | |||
| if (correctness_passed || mode == "performance") { | |||
| benchmarkPerformance(); | |||
| } else { | |||
| std::cout << "跳过性能测试,因为正确性测试未通过" << std::endl; | |||
| performance_completed = false; | |||
| } | |||
| } | |||
| std::cout << "\n=== 测试完成 ===" << std::endl; | |||
| std::cout << "实现状态: " << SortPairAlgorithm<float, uint32_t>::getImplementationStatus() << std::endl; | |||
| if (mode == "all") { | |||
| std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl; | |||
| std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl; | |||
| } | |||
| return correctness_passed ? 0 : 1; | |||
| } catch (const std::exception& e) { | |||
| std::cerr << "测试出错: " << e.what() << std::endl; | |||
| return 1; | |||
| } | |||
| } | |||
| @@ -0,0 +1,317 @@ | |||
| #include "test_utils.h" | |||
| #include "performance_utils.h" | |||
| #include "yaml_reporter.h" | |||
| #include <iostream> | |||
| #include <vector> | |||
| #include <iomanip> | |||
| #include <fstream> | |||
| #include <map> | |||
| #include <chrono> | |||
| // ============================================================================ | |||
| // 实现标记宏 - 参赛者修改实现时请将此宏设为0 | |||
| // ============================================================================ | |||
| #ifndef USE_DEFAULT_REF_IMPL | |||
| #define USE_DEFAULT_REF_IMPL 1 // 1=默认实现, 0=参赛者自定义实现 | |||
| #endif | |||
| #if USE_DEFAULT_REF_IMPL | |||
| #include <thrust/sort.h> | |||
| #include <thrust/device_vector.h> | |||
| #include <thrust/execution_policy.h> | |||
| #include <thrust/iterator/zip_iterator.h> | |||
| #include <thrust/tuple.h> | |||
| #include <thrust/copy.h> | |||
| #endif | |||
| static const int TOPK_VALUES[] = {32, 50, 100, 256, 1024}; | |||
| static const int NUM_TOPK_VALUES = sizeof(TOPK_VALUES) / sizeof(TOPK_VALUES[0]); | |||
| // ============================================================================ | |||
| // TopkPair算法实现接口 | |||
| // 参赛者需要替换Thrust实现为自己的高性能kernel | |||
| // ============================================================================ | |||
| template <typename KeyType, typename ValueType> | |||
| class TopkPairAlgorithm { | |||
| public: | |||
| // 主要接口函数 - 参赛者需要实现这个函数 | |||
| void topk(const KeyType* d_keys_in, KeyType* d_keys_out, | |||
| const ValueType* d_values_in, ValueType* d_values_out, | |||
| int num_items, int k, bool descending) { | |||
| #if !USE_DEFAULT_REF_IMPL | |||
| // ======================================== | |||
| // 参赛者自定义实现区域 | |||
| // ======================================== | |||
| // TODO: 参赛者在此实现自己的高性能TopK算法 | |||
| // 示例:参赛者可以调用多个自定义kernel | |||
| // TopkKernel1<<<grid, block>>>(d_keys_in, d_values_in, temp_results, num_items, k); | |||
| // TopkKernel2<<<grid, block>>>(temp_results, d_keys_out, d_values_out, k, descending); | |||
| #else | |||
| // ======================================== | |||
| // 默认基准实现 | |||
| // ======================================== | |||
| KeyType* temp_keys; | |||
| ValueType* temp_values; | |||
| MACA_CHECK(mcMalloc(&temp_keys, num_items * sizeof(KeyType))); | |||
| MACA_CHECK(mcMalloc(&temp_values, num_items * sizeof(ValueType))); | |||
| MACA_CHECK(mcMemcpy(temp_keys, d_keys_in, num_items * sizeof(KeyType), mcMemcpyDeviceToDevice)); | |||
| MACA_CHECK(mcMemcpy(temp_values, d_values_in, num_items * sizeof(ValueType), mcMemcpyDeviceToDevice)); | |||
| auto key_ptr = thrust::device_pointer_cast(temp_keys); | |||
| auto value_ptr = thrust::device_pointer_cast(temp_values); | |||
| // 由于greater和less是不同类型,需要分别调用 | |||
| if (descending) { | |||
| thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::greater<KeyType>()); | |||
| } else { | |||
| thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::less<KeyType>()); | |||
| } | |||
| MACA_CHECK(mcMemcpy(d_keys_out, temp_keys, k * sizeof(KeyType), mcMemcpyDeviceToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_out, temp_values, k * sizeof(ValueType), mcMemcpyDeviceToDevice)); | |||
| mcFree(temp_keys); | |||
| mcFree(temp_values); | |||
| #endif | |||
| } | |||
| // 获取当前实现状态 | |||
| static const char* getImplementationStatus() { | |||
| #if USE_DEFAULT_REF_IMPL | |||
| return "DEFAULT_REF_IMPL"; | |||
| #else | |||
| return "CUSTOM_IMPL"; | |||
| #endif | |||
| } | |||
| private: | |||
| // 参赛者可以在这里添加辅助函数和成员变量 | |||
| // 例如:分块大小、临时缓冲区、多流处理等 | |||
| }; | |||
| // ============================================================================ | |||
| // 测试和性能评估 | |||
| // ============================================================================ | |||
| bool testCorrectness() { | |||
| std::cout << "TopkPair 正确性测试..." << std::endl; | |||
| TestDataGenerator generator; | |||
| TopkPairAlgorithm<float, uint32_t> algorithm; | |||
| int size = 10000; | |||
| auto keys = generator.generateRandomFloats(size); | |||
| auto values = generator.generateRandomUint32(size); | |||
| // 分配GPU内存 | |||
| float *d_keys_in, *d_keys_out; | |||
| uint32_t *d_values_in, *d_values_out; | |||
| MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); | |||
| bool allPassed = true; | |||
| // 测试不同k值 | |||
| for (int ki = 0; ki < NUM_TOPK_VALUES && ki < 4; ki++) { // 限制测试范围 | |||
| int k = TOPK_VALUES[ki]; | |||
| if (k > size) continue; | |||
| std::cout << " 测试 k=" << k << std::endl; | |||
| MACA_CHECK(mcMalloc(&d_keys_out, k * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_out, k * sizeof(uint32_t))); | |||
| for (bool descending : {false, true}) { | |||
| std::cout << " " << (descending ? "降序" : "升序") << " TopK..." << std::endl; | |||
| // CPU参考结果 | |||
| std::vector<float> cpu_keys_out; | |||
| std::vector<uint32_t> cpu_values_out; | |||
| cpuTopkPair(keys, values, cpu_keys_out, cpu_values_out, k, descending); | |||
| // GPU算法结果 | |||
| algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending); | |||
| // 获取结果 | |||
| std::vector<float> gpu_keys_out(k); | |||
| std::vector<uint32_t> gpu_values_out(k); | |||
| MACA_CHECK(mcMemcpy(gpu_keys_out.data(), d_keys_out, k * sizeof(float), mcMemcpyDeviceToHost)); | |||
| MACA_CHECK(mcMemcpy(gpu_values_out.data(), d_values_out, k * sizeof(uint32_t), mcMemcpyDeviceToHost)); | |||
| // 验证结果 | |||
| bool keysMatch = compareArrays(cpu_keys_out, gpu_keys_out, 1e-5); | |||
| bool valuesMatch = compareArrays(cpu_values_out, gpu_values_out); | |||
| if (!keysMatch || !valuesMatch) { | |||
| std::cout << " 失败: 结果不匹配" << std::endl; | |||
| allPassed = false; | |||
| } else { | |||
| std::cout << " 通过" << std::endl; | |||
| } | |||
| } | |||
| mcFree(d_keys_out); | |||
| mcFree(d_values_out); | |||
| } | |||
| // 清理内存 | |||
| mcFree(d_keys_in); | |||
| mcFree(d_values_in); | |||
| return allPassed; | |||
| } | |||
| void benchmarkPerformance() { | |||
| std::cout << "\nTopkPair 性能测试..." << std::endl; | |||
| std::cout << "数据类型: <float, uint32_t>" << std::endl; | |||
| std::cout << "计算公式:" << std::endl; | |||
| std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; | |||
| TestDataGenerator generator; | |||
| PerformanceMeter meter; | |||
| TopkPairAlgorithm<float, uint32_t> algorithm; | |||
| const int WARMUP_ITERATIONS = 5; | |||
| const int BENCHMARK_ITERATIONS = 10; | |||
| // 用于YAML报告的数据收集 | |||
| std::vector<std::map<std::string, std::string>> perf_data; | |||
| // 针对不同数据规模测试 | |||
| for (int size_idx = 0; size_idx < NUM_TEST_SIZES; size_idx++) { | |||
| int size = TEST_SIZES[size_idx]; | |||
| std::cout << "\n数据规模: " << size << std::endl; | |||
| std::cout << std::setw(8) << "k值" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)" | |||
| << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl; | |||
| std::cout << std::string(74, '-') << std::endl; | |||
| auto keys = generator.generateRandomFloats(size); | |||
| auto values = generator.generateRandomUint32(size); | |||
| // 分配GPU内存 | |||
| float *d_keys_in; | |||
| uint32_t *d_values_in; | |||
| MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); | |||
| for (int ki = 0; ki < NUM_TOPK_VALUES; ki++) { | |||
| int k = TOPK_VALUES[ki]; | |||
| if (k > size) continue; | |||
| float *d_keys_out; | |||
| uint32_t *d_values_out; | |||
| MACA_CHECK(mcMalloc(&d_keys_out, k * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_out, k * sizeof(uint32_t))); | |||
| float asc_time = 0, desc_time = 0; | |||
| for (bool descending : {false, true}) { | |||
| // Warmup阶段 | |||
| for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) { | |||
| algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending); | |||
| } | |||
| // 正式测试阶段 | |||
| float total_time = 0; | |||
| for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) { | |||
| meter.startTiming(); | |||
| algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending); | |||
| total_time += meter.stopTiming(); | |||
| } | |||
| float avg_time = total_time / BENCHMARK_ITERATIONS; | |||
| if (descending) { | |||
| desc_time = avg_time; | |||
| } else { | |||
| asc_time = avg_time; | |||
| } | |||
| } | |||
| // 计算性能指标 | |||
| auto asc_metrics = PerformanceCalculator::calculateTopkPair(size, k, asc_time); | |||
| auto desc_metrics = PerformanceCalculator::calculateTopkPair(size, k, desc_time); | |||
| // 显示性能数据 | |||
| PerformanceDisplay::printTopkPairData(k, asc_time, desc_time, asc_metrics, desc_metrics); | |||
| // 收集YAML报告数据 | |||
| auto entry = YAMLPerformanceReporter::createEntry(); | |||
| entry["data_size"] = std::to_string(size); | |||
| entry["k_value"] = std::to_string(k); | |||
| entry["asc_time_ms"] = std::to_string(asc_time); | |||
| entry["desc_time_ms"] = std::to_string(desc_time); | |||
| entry["asc_throughput_gps"] = std::to_string(asc_metrics.throughput_gps); | |||
| entry["desc_throughput_gps"] = std::to_string(desc_metrics.throughput_gps); | |||
| entry["key_type"] = "float"; | |||
| entry["value_type"] = "uint32_t"; | |||
| perf_data.push_back(entry); | |||
| mcFree(d_keys_out); | |||
| mcFree(d_values_out); | |||
| } | |||
| mcFree(d_keys_in); | |||
| mcFree(d_values_in); | |||
| } | |||
| // 生成YAML性能报告 | |||
| YAMLPerformanceReporter::generateTopkPairYAML(perf_data, "topk_pair_performance.yaml"); | |||
| PerformanceDisplay::printSavedMessage("topk_pair_performance.yaml"); | |||
| } | |||
| // ============================================================================ | |||
| // 主函数 | |||
| // ============================================================================ | |||
| int main(int argc, char* argv[]) { | |||
| std::cout << "=== TopkPair 算法测试 ===" << std::endl; | |||
| // 检查参数 | |||
| std::string mode = "all"; | |||
| if (argc > 1) { | |||
| mode = argv[1]; | |||
| } | |||
| bool correctness_passed = true; | |||
| bool performance_completed = true; | |||
| try { | |||
| if (mode == "correctness" || mode == "all") { | |||
| correctness_passed = testCorrectness(); | |||
| } | |||
| if (mode == "performance" || mode == "all") { | |||
| if (correctness_passed || mode == "performance") { | |||
| benchmarkPerformance(); | |||
| } else { | |||
| std::cout << "跳过性能测试,因为正确性测试未通过" << std::endl; | |||
| performance_completed = false; | |||
| } | |||
| } | |||
| std::cout << "\n=== 测试完成 ===" << std::endl; | |||
| std::cout << "实现状态: " << TopkPairAlgorithm<float, uint32_t>::getImplementationStatus() << std::endl; | |||
| if (mode == "all") { | |||
| std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl; | |||
| std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl; | |||
| } | |||
| return correctness_passed ? 0 : 1; | |||
| } catch (const std::exception& e) { | |||
| std::cerr << "测试出错: " << e.what() << std::endl; | |||
| return 1; | |||
| } | |||
| } | |||
| @@ -0,0 +1,114 @@ | |||
| #pragma once | |||
| #include <iostream> | |||
| #include <iomanip> | |||
| #include <string> | |||
| // ============================================================================ | |||
| // 性能计算和显示工具 | |||
| // ============================================================================ | |||
// Converts a measured duration into the throughput figure reported by the
// benchmarks: elements / time(s) / 1e9, i.e. giga-elements per second.
//
// All three algorithms use the same formula; the separate result structs and
// entry points are kept so existing callers compile unchanged.
class PerformanceCalculator {
public:
    // ReduceSum metrics.
    struct ReduceSumMetrics {
        double throughput_gps;  // G elements/s
    };

    // size: number of input elements; time_ms: measured duration in milliseconds.
    static ReduceSumMetrics calculateReduceSum(int size, float time_ms) {
        ReduceSumMetrics metrics;
        metrics.throughput_gps = throughputGps(size, time_ms);
        return metrics;
    }

    // SortPair metrics.
    struct SortPairMetrics {
        double throughput_gps;  // G elements/s
    };

    // size: number of input elements; time_ms: measured duration in milliseconds.
    static SortPairMetrics calculateSortPair(int size, float time_ms) {
        SortPairMetrics metrics;
        metrics.throughput_gps = throughputGps(size, time_ms);
        return metrics;
    }

    // TopkPair metrics.
    struct TopkPairMetrics {
        double throughput_gps;  // G elements/s
    };

    // size: number of input elements; time_ms: measured duration in milliseconds.
    // k is accepted for interface symmetry but does not enter the formula:
    // throughput is measured over the scanned input, not the k outputs.
    static TopkPairMetrics calculateTopkPair(int size, [[maybe_unused]] int k, float time_ms) {
        TopkPairMetrics metrics;
        metrics.throughput_gps = throughputGps(size, time_ms);
        return metrics;
    }

private:
    // Shared formula. A non-positive (or NaN) duration yields 0 instead of
    // inf/NaN so that a timer glitch cannot put non-numeric values into the
    // console table or the YAML report.
    static double throughputGps(int size, float time_ms) {
        if (!(time_ms > 0.0f)) {
            return 0.0;
        }
        return (size / 1e9) / (time_ms / 1000.0);
    }
};
| // ============================================================================ | |||
| // 性能显示工具 | |||
| // ============================================================================ | |||
| class PerformanceDisplay { | |||
| public: | |||
| // 显示ReduceSum性能表头 | |||
| static void printReduceSumHeader() { | |||
| std::cout << "\nReduceSum 性能测试..." << std::endl; | |||
| std::cout << "数据类型: float -> float" << std::endl; | |||
| std::cout << "计算公式:" << std::endl; | |||
| std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; | |||
| std::cout << std::setw(12) << "数据规模" << std::setw(15) << "时间(ms)" | |||
| << std::setw(20) << "吞吐量(G/s)" << std::endl; | |||
| std::cout << std::string(47, '-') << std::endl; | |||
| } | |||
| // 显示SortPair性能表头 | |||
| static void printSortPairHeader() { | |||
| std::cout << "\nSortPair 性能测试..." << std::endl; | |||
| std::cout << "数据类型: <float, uint32_t>" << std::endl; | |||
| std::cout << "计算公式:" << std::endl; | |||
| std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; | |||
| std::cout << std::setw(12) << "数据规模" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)" | |||
| << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl; | |||
| std::cout << std::string(78, '-') << std::endl; | |||
| } | |||
| // 显示TopkPair性能表头 | |||
| static void printTopkPairHeader() { | |||
| std::cout << "\nTopkPair 性能测试..." << std::endl; | |||
| std::cout << "数据类型: <float, uint32_t>" << std::endl; | |||
| std::cout << "计算公式:" << std::endl; | |||
| std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; | |||
| } | |||
| static void printTopkPairDataHeader() { | |||
| std::cout << std::setw(8) << "k值" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)" | |||
| << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl; | |||
| std::cout << std::string(74, '-') << std::endl; | |||
| } | |||
| // 显示性能数据行 | |||
| static void printReduceSumData(int size, float time_ms, const PerformanceCalculator::ReduceSumMetrics& metrics) { | |||
| std::cout << std::setw(12) << size << std::setw(15) << std::fixed << std::setprecision(3) | |||
| << time_ms << std::setw(20) << std::setprecision(3) << metrics.throughput_gps << std::endl; | |||
| } | |||
| static void printSortPairData(int size, float asc_time, float desc_time, | |||
| const PerformanceCalculator::SortPairMetrics& asc_metrics, | |||
| const PerformanceCalculator::SortPairMetrics& desc_metrics) { | |||
| std::cout << std::setw(12) << size << std::setw(15) << std::fixed << std::setprecision(3) | |||
| << asc_time << std::setw(15) << desc_time << std::setw(16) << std::setprecision(3) | |||
| << asc_metrics.throughput_gps << std::setw(16) << desc_metrics.throughput_gps << std::endl; | |||
| } | |||
| static void printTopkPairData(int k, float asc_time, float desc_time, | |||
| const PerformanceCalculator::TopkPairMetrics& asc_metrics, | |||
| const PerformanceCalculator::TopkPairMetrics& desc_metrics) { | |||
| std::cout << std::setw(8) << k << std::setw(15) << std::fixed << std::setprecision(3) | |||
| << asc_time << std::setw(15) << desc_time << std::setw(16) << std::setprecision(3) | |||
| << asc_metrics.throughput_gps << std::setw(16) << desc_metrics.throughput_gps << std::endl; | |||
| } | |||
| // 显示性能文件保存消息 | |||
| static void printSavedMessage(const std::string& filename) { | |||
| std::cout << "\n性能结果已保存到: " << filename << std::endl; | |||
| } | |||
| }; | |||
| @@ -0,0 +1,234 @@ | |||
| #pragma once | |||
| #include <vector> | |||
| #include <random> | |||
| #include <algorithm> | |||
| #include <mc_runtime.h> | |||
| #include <maca_fp16.h> | |||
| #include <iostream> | |||
| #include <chrono> | |||
| #include <cmath> | |||
| // 引入模块化头文件 | |||
| #include "yaml_reporter.h" | |||
| #include "performance_utils.h" | |||
| // ============================================================================ | |||
| // 测试配置常量 | |||
| // ============================================================================ | |||
| #ifndef RUN_FULL_TEST | |||
| const int TEST_SIZES[] = {1000000, 134217728}; // Default run: 1M, 128M | |||
| #else | |||
| const int TEST_SIZES[] = {1000000, 134217728, 536870912, 1073741824}; // Full run (RUN_FULL_TEST defined): 1M, 128M, 512M, 1G | |||
| #endif | |||
| const int NUM_TEST_SIZES = sizeof(TEST_SIZES) / sizeof(TEST_SIZES[0]); | |||
| // Iteration counts for performance tests | |||
| constexpr int WARMUP_ITERATIONS = 5; | |||
| constexpr int BENCHMARK_ITERATIONS = 10; | |||
| // ============================================================================ | |||
| // 错误检查宏 | |||
| // ============================================================================ | |||
// Evaluates a MACA runtime call exactly once. If the result is not mcSuccess,
// prints the source location and mcGetErrorString() text to std::cerr and
// terminates the process with exit code 1.
//
// The status variable has a macro-specific name and the argument is
// parenthesised, so a call expression that itself mentions a caller variable
// such as `error` is not captured by the macro's own local.
#define MACA_CHECK(call)                                                          \
    do {                                                                          \
        const mcError_t maca_check_status_ = (call);                              \
        if (maca_check_status_ != mcSuccess) {                                    \
            std::cerr << "MACA error at " << __FILE__ << ":" << __LINE__          \
                      << " - " << mcGetErrorString(maca_check_status_)            \
                      << std::endl;                                               \
            exit(1);                                                              \
        }                                                                         \
    } while (0)
| // ============================================================================ | |||
| // 测试数据生成器 | |||
| // ============================================================================ | |||
| class TestDataGenerator { | |||
| private: | |||
| std::mt19937 rng; | |||
| public: | |||
| TestDataGenerator(uint32_t seed = 42) : rng(seed) {} | |||
| // 生成随机float数组 | |||
| std::vector<float> generateRandomFloats(int size, float min_val = -1000.0f, float max_val = 1000.0f) { | |||
| std::vector<float> data(size); | |||
| std::uniform_real_distribution<float> dist(min_val, max_val); | |||
| for (int i = 0; i < size; i++) { | |||
| data[i] = dist(rng); | |||
| } | |||
| return data; | |||
| } | |||
| // 生成随机half数组 | |||
| std::vector<half> generateRandomHalfs(int size, float min_val = -100.0f, float max_val = 100.0f) { | |||
| std::vector<half> data(size); | |||
| std::uniform_real_distribution<float> dist(min_val, max_val); | |||
| for (int i = 0; i < size; i++) { | |||
| data[i] = __float2half(dist(rng)); | |||
| } | |||
| return data; | |||
| } | |||
| // 生成随机uint32_t数组 | |||
| std::vector<uint32_t> generateRandomUint32(int size) { | |||
| std::vector<uint32_t> data(size); | |||
| for (int i = 0; i < size; i++) { | |||
| data[i] = static_cast<uint32_t>(i); // 使用索引作为值,便于验证稳定排序 | |||
| } | |||
| return data; | |||
| } | |||
| // 生成随机int64_t数组 | |||
| std::vector<int64_t> generateRandomInt64(int size) { | |||
| std::vector<int64_t> data(size); | |||
| for (int i = 0; i < size; i++) { | |||
| data[i] = static_cast<int64_t>(i); | |||
| } | |||
| return data; | |||
| } | |||
| // 生成包含NaN和Inf的测试数据 (half版本) | |||
| std::vector<half> generateSpecialHalfs(int size) { | |||
| std::vector<half> data = generateRandomHalfs(size, -10.0f, 10.0f); | |||
| if (size > 100) { | |||
| data[10] = __float2half(NAN); | |||
| data[20] = __float2half(INFINITY); | |||
| data[30] = __float2half(-INFINITY); | |||
| } | |||
| return data; | |||
| } | |||
| // 生成包含NaN和Inf的测试数据 (float版本) | |||
| std::vector<float> generateSpecialFloats(int size) { | |||
| std::vector<float> data = generateRandomFloats(size, -10.0f, 10.0f); | |||
| if (size > 100) { | |||
| data[10] = NAN; | |||
| data[20] = INFINITY; | |||
| data[30] = -INFINITY; | |||
| } | |||
| return data; | |||
| } | |||
| }; | |||
| // ============================================================================ | |||
| // 性能测试工具 | |||
| // ============================================================================ | |||
| class PerformanceMeter { | |||
| private: | |||
| mcEvent_t start, stop; | |||
| public: | |||
| PerformanceMeter() { | |||
| MACA_CHECK(mcEventCreate(&start)); | |||
| MACA_CHECK(mcEventCreate(&stop)); | |||
| } | |||
| ~PerformanceMeter() { | |||
| mcEventDestroy(start); | |||
| mcEventDestroy(stop); | |||
| } | |||
| void startTiming() { | |||
| MACA_CHECK(mcEventRecord(start)); | |||
| } | |||
| float stopTiming() { | |||
| MACA_CHECK(mcEventRecord(stop)); | |||
| MACA_CHECK(mcEventSynchronize(stop)); | |||
| float milliseconds = 0; | |||
| MACA_CHECK(mcEventElapsedTime(&milliseconds, start, stop)); | |||
| return milliseconds; | |||
| } | |||
| }; | |||
| // ============================================================================ | |||
| // 正确性验证工具 | |||
| // ============================================================================ | |||
| template<typename T> | |||
| bool compareArrays(const std::vector<T>& a, const std::vector<T>& b, double tolerance = 1e-6) { | |||
| if (a.size() != b.size()) return false; | |||
| for (size_t i = 0; i < a.size(); i++) { | |||
| if constexpr (std::is_same_v<T, half>) { | |||
| float fa = __half2float(a[i]); | |||
| float fb = __half2float(b[i]); | |||
| if (std::isnan(fa) && std::isnan(fb)) continue; | |||
| if (std::isinf(fa) && std::isinf(fb) && (fa > 0) == (fb > 0)) continue; | |||
| if (std::abs(fa - fb) > tolerance) return false; | |||
| } else if constexpr (std::is_floating_point_v<T>) { | |||
| if (std::isnan(a[i]) && std::isnan(b[i])) continue; | |||
| if (std::isinf(a[i]) && std::isinf(b[i]) && (a[i] > 0) == (b[i] > 0)) continue; | |||
| if (std::abs(a[i] - b[i]) > tolerance) return false; | |||
| } else { | |||
| if (a[i] != b[i]) return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// CPU reference: stable key/value sort, in place.
//
// Sorts the pairs (keys[i], values[i]) by key only - ascending, or descending
// when `descending` is true. Pairs with equal keys keep their input order in
// both directions, which is what a stable GPU sort must reproduce.
//
// Precondition: values has at least keys.size() elements.
template<typename KeyType, typename ValueType>
void cpuSortPair(std::vector<KeyType>& keys, std::vector<ValueType>& values, bool descending) {
    std::vector<std::pair<KeyType, ValueType>> pairs;
    pairs.reserve(keys.size());
    for (size_t i = 0; i < keys.size(); i++) {
        pairs.emplace_back(keys[i], values[i]);
    }

    // Compare keys only. The default pair ordering would break ties by value
    // and silently reorder equal keys.
    if (descending) {
        std::stable_sort(pairs.begin(), pairs.end(),
                         [](const auto& a, const auto& b) { return a.first > b.first; });
    } else {
        std::stable_sort(pairs.begin(), pairs.end(),
                         [](const auto& a, const auto& b) { return a.first < b.first; });
    }

    for (size_t i = 0; i < pairs.size(); i++) {
        keys[i] = pairs[i].first;
        values[i] = pairs[i].second;
    }
}
// CPU reference: top-k selection over key/value pairs.
//
// Stable-sorts all pairs by key only (ascending, or descending when
// `descending` is true) and copies the first k of them to keys_out/values_out.
// Pairs with equal keys keep their input order.
//
// k is clamped to [0, keys_in.size()], so a k larger than the input yields all
// pairs and a non-positive k yields empty outputs.
//
// Precondition: values_in has at least keys_in.size() elements.
template<typename KeyType, typename ValueType>
void cpuTopkPair(const std::vector<KeyType>& keys_in, const std::vector<ValueType>& values_in,
                 std::vector<KeyType>& keys_out, std::vector<ValueType>& values_out,
                 int k, bool descending) {
    std::vector<std::pair<KeyType, ValueType>> pairs;
    pairs.reserve(keys_in.size());
    for (size_t i = 0; i < keys_in.size(); i++) {
        pairs.emplace_back(keys_in[i], values_in[i]);
    }

    // Compare keys only. The default pair ordering would break ties by value.
    if (descending) {
        std::stable_sort(pairs.begin(), pairs.end(),
                         [](const auto& a, const auto& b) { return a.first > b.first; });
    } else {
        std::stable_sort(pairs.begin(), pairs.end(),
                         [](const auto& a, const auto& b) { return a.first < b.first; });
    }

    const size_t count = (k <= 0) ? 0 : std::min(static_cast<size_t>(k), pairs.size());

    keys_out.resize(count);
    values_out.resize(count);
    for (size_t i = 0; i < count; i++) {
        keys_out[i] = pairs[i].first;
        values_out[i] = pairs[i].second;
    }
}
| // CPU参考实现 - ReduceSum (使用double精度) | |||
| template<typename InputT> | |||
| double cpuReduceSum(const std::vector<InputT>& data, double init_value) { | |||
| double sum = init_value; | |||
| for (const auto& val : data) { | |||
| if constexpr (std::is_same_v<InputT, half>) { | |||
| float f_val = __half2float(val); | |||
| if (!std::isnan(f_val)) { | |||
| sum += static_cast<double>(f_val); | |||
| } | |||
| } else { | |||
| if (!std::isnan(val)) { | |||
| sum += static_cast<double>(val); | |||
| } | |||
| } | |||
| } | |||
| return sum; | |||
| } | |||
| @@ -0,0 +1,154 @@ | |||
| #pragma once | |||
| #include <fstream> | |||
| #include <vector> | |||
| #include <map> | |||
| #include <string> | |||
| #include <chrono> | |||
| #include <iomanip> | |||
| #include <sstream> | |||
| // ============================================================================ | |||
| // YAML性能报告生成器 | |||
| // ============================================================================ | |||
// Writes benchmark results as YAML files.
//
// Each generate*YAML function takes one string->string map per measured
// configuration and writes: a comment header with the generation time, the
// algorithm name, its data types, the throughput formula, and a
// `performance_data` list with one item per map.
//
// Required keys are looked up with map::at, so a missing key throws
// std::out_of_range. Nested keys are indented two spaces per level so that
// list items and their ascending/descending sub-maps parse as nested YAML.
class YAMLPerformanceReporter {
public:
    struct PerformanceData {
        std::string algorithm;
        std::string input_type;
        std::string output_type;
        std::string key_type;
        std::string value_type;
        std::vector<std::map<std::string, std::string>> metrics;
    };

    // Returns an empty row to be filled by the caller.
    static std::map<std::string, std::string> createEntry() {
        return std::map<std::string, std::string>();
    }

    // Writes the ReduceSum report.
    // Row keys: data_size, time_ms, throughput_gps, data_type.
    static void generateReduceSumYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
                                      const std::string& filename = "reduce_sum_performance.yaml") {
        std::ofstream yaml_file(filename);

        writeHeader(yaml_file, "ReduceSum算法性能测试结果");

        yaml_file << "algorithm: \"ReduceSum\"\n";
        yaml_file << "data_types:\n";
        yaml_file << "  input: \"float\"\n";
        yaml_file << "  output: \"float\"\n";

        writeFormulas(yaml_file);

        yaml_file << "performance_data:\n";
        for (const auto& data : perf_data) {
            yaml_file << "  - data_size: " << data.at("data_size") << "\n";
            yaml_file << "    time_ms: " << formatFloat(data.at("time_ms")) << "\n";
            yaml_file << "    throughput_gps: " << formatFloat(data.at("throughput_gps")) << "\n";
            yaml_file << "    data_type: \"" << data.at("data_type") << "\"\n";
        }

        yaml_file.close();
    }

    // Writes the SortPair report.
    // Row keys: data_size, asc_time_ms, asc_throughput_gps, desc_time_ms,
    // desc_throughput_gps, key_type, value_type.
    static void generateSortPairYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
                                     const std::string& filename = "sort_pair_performance.yaml") {
        std::ofstream yaml_file(filename);

        writeHeader(yaml_file, "SortPair算法性能测试结果");
        writeKeyValuePreamble(yaml_file, "SortPair");

        yaml_file << "performance_data:\n";
        for (const auto& data : perf_data) {
            yaml_file << "  - data_size: " << data.at("data_size") << "\n";
            writeAscDescFields(yaml_file, data);
        }

        yaml_file.close();
    }

    // Writes the TopkPair report.
    // Row keys: those of SortPair plus k_value.
    static void generateTopkPairYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
                                     const std::string& filename = "topk_pair_performance.yaml") {
        std::ofstream yaml_file(filename);

        writeHeader(yaml_file, "TopkPair算法性能测试结果");
        writeKeyValuePreamble(yaml_file, "TopkPair");

        yaml_file << "performance_data:\n";
        for (const auto& data : perf_data) {
            yaml_file << "  - data_size: " << data.at("data_size") << "\n";
            yaml_file << "    k_value: " << data.at("k_value") << "\n";
            writeAscDescFields(yaml_file, data);
        }

        yaml_file.close();
    }

private:
    // Writes the two comment lines at the top of a report: title and local generation time.
    static void writeHeader(std::ofstream& file, const std::string& title) {
        file << "# " << title << "\n";
        file << "# 生成时间: ";

        const auto now = std::chrono::system_clock::now();
        const auto now_c = std::chrono::system_clock::to_time_t(now);
        file << std::put_time(std::localtime(&now_c), "%Y-%m-%d %H:%M:%S");
        file << "\n\n";
    }

    // Writes the `formulas` section shared by every report.
    static void writeFormulas(std::ofstream& file) {
        file << "formulas:\n";
        file << "  throughput: \"elements / time(s) / 1e9 (G/s)\"\n";
    }

    // Writes algorithm name, key/value data types and formulas for the
    // key-value algorithms (SortPair, TopkPair).
    static void writeKeyValuePreamble(std::ofstream& file, const std::string& algorithm) {
        file << "algorithm: \"" << algorithm << "\"\n";
        file << "data_types:\n";
        file << "  key_type: \"float\"\n";
        file << "  value_type: \"uint32_t\"\n";
        writeFormulas(file);
    }

    // Writes the ascending/descending sub-maps and the type fields of one list item.
    static void writeAscDescFields(std::ofstream& file, const std::map<std::string, std::string>& data) {
        file << "    ascending:\n";
        file << "      time_ms: " << formatFloat(data.at("asc_time_ms")) << "\n";
        file << "      throughput_gps: " << formatFloat(data.at("asc_throughput_gps")) << "\n";
        file << "    descending:\n";
        file << "      time_ms: " << formatFloat(data.at("desc_time_ms")) << "\n";
        file << "      throughput_gps: " << formatFloat(data.at("desc_throughput_gps")) << "\n";
        file << "    key_type: \"" << data.at("key_type") << "\"\n";
        file << "    value_type: \"" << data.at("value_type") << "\"\n";
    }

    // Re-formats a numeric string with six fixed decimals. Text that std::stod
    // cannot parse is passed through unchanged (deliberate best-effort).
    static std::string formatFloat(const std::string& value) {
        try {
            const double parsed = std::stod(value);
            std::ostringstream oss;
            oss << std::fixed << std::setprecision(6) << parsed;
            return oss.str();
        } catch (...) {
            return value;
        }
    }
};
| @@ -0,0 +1,103 @@ | |||
| # 题目: | |||
| ## GPU高性能并行计算算法优化 | |||
| 要求参赛者通过一个或多个global kernel 函数(允许配套 device 辅助函数),实现高性能算法。 | |||
| 在正确性、稳定性前提下,比拼算法性能。 | |||
| # 1. ReduceSum算法优化 | |||
| ```cpp | |||
| template <typename InputT = float, typename OutputT = float> | |||
| class ReduceSumAlgorithm { | |||
| public: | |||
| // 主要接口函数 - 参赛者需要实现这个函数 | |||
| void reduce(const InputT* d_in, OutputT* d_out, int num_items, OutputT init_value) { | |||
| // TODO | |||
| } | |||
| }; | |||
| ``` | |||
| 其中 | |||
| * 数据类型:InputT: float, OutputT: float | |||
| * 系统将测试评估1M, 128M, 512M, 1G element number下的算法性能 | |||
| * 假定输入d\_in数据量为num\_items | |||
| 注意事项 | |||
| * 累计误差不大于cpu double golden基准的0.5% | |||
| * 注意针对NAN和INF等异常值的处理 | |||
| 加分项 | |||
| * 使用tensor core计算reduce | |||
| * 覆盖更全面的数据范围,提供良好稳定的性能表现 | |||
| # 2. Sort Pair算法优化 | |||
| ```cpp | |||
| template <typename KeyType, typename ValueType> | |||
| class SortPairAlgorithm { | |||
| public: | |||
| // 主要接口函数 - 参赛者需要实现这个函数 | |||
| void sort(const KeyType* d_keys_in, KeyType* d_keys_out, | |||
| const ValueType* d_values_in, ValueType* d_values_out, | |||
| int num_items, bool descending) { | |||
| // TODO | |||
| } | |||
| }; | |||
| ``` | |||
| 其中 | |||
| * 数据类型:key: float, value: uint32\_t | |||
| * 系统将测试评估1M, 128M, 512M, 1G element number下的算法性能 | |||
| * 假定输入、输出的key和value的数据量一致,均为num\_items | |||
| 注意事项 | |||
| * 需要校验结果正确性 | |||
| * 结果必须稳定排序 | |||
| 加分项 | |||
| * 支持其他不同数据类型的排序,如half、double、int32_t等 | |||
| * 覆盖更全面的数据范围,提供良好稳定的性能表现 | |||
| # 3. Topk Pair算法优化 | |||
| ```cpp | |||
| template <typename KeyType, typename ValueType> | |||
| class TopkPairAlgorithm { | |||
| public: | |||
| // 主要接口函数 - 参赛者需要实现这个函数 | |||
| void topk(const KeyType* d_keys_in, KeyType* d_keys_out, | |||
| const ValueType* d_values_in, ValueType* d_values_out, | |||
| int num_items, int k, bool descending) { | |||
| // TODO | |||
| } | |||
| }; | |||
| ``` | |||
| 其中 | |||
| * 数据类型:key: float, value: uint32\_t | |||
| * 系统将测试评估1M, 128M, 512M, 1G element number下的算法性能 | |||
| * 假定输入的key和value的数据量一致,为num\_items;输出的key和value的数据量一致,为k | |||
| * k的范围:32,50,100,256,1024。k不大于num\_items | |||
| 注意事项 | |||
| * 结果必须稳定排序 | |||
| 加分项 | |||
| * 支持其他不同数据类型的键值对,实现类型通用算法 | |||
| * 覆盖更全面的数据范围,提供良好稳定的性能表现 | |||
| @@ -0,0 +1,168 @@ | |||
| # GPU 高性能并行计算算法优化竞赛 | |||
| ## 🎯 竞赛概述 | |||
| 本竞赛旨在评估参赛者在GPU并行计算领域的算法优化能力。参赛者可选择实现三个核心算法的高性能版本: | |||
| - **ReduceSum**: 高精度归约求和 | |||
| - **SortPair**: 键值对稳定排序 | |||
| - **TopkPair**: 键值对TopK选择 | |||
| ## 🚀 快速开始 | |||
| ### 编译和测试 | |||
| #### 1. 全量编译和运行 | |||
| ```bash | |||
| # 编译并运行所有算法测试(默认行为) | |||
| ./build_and_run.sh | |||
| # 仅编译所有算法,不运行测试 | |||
| ./build_and_run.sh --build-only | |||
| # 编译并运行单个算法测试 | |||
| ./build_and_run.sh --run_reduce # ReduceSum算法 | |||
| ./build_and_run.sh --run_sort # SortPair算法 | |||
| ./build_and_run.sh --run_topk # TopkPair算法 | |||
| ``` | |||
| #### 2. 单独编译和运行 | |||
| ```bash | |||
| # 编译并运行ReduceSum算法(默认行为) | |||
| ./build_and_run_reduce_sum.sh | |||
| # 仅编译ReduceSum算法,不运行测试 | |||
| ./build_and_run_reduce_sum.sh --build-only | |||
| # 编译并运行SortPair正确性测试 | |||
| ./build_and_run_sort_pair.sh --run correctness | |||
| # 编译并运行TopkPair性能测试 | |||
| ./build_and_run_topk_pair.sh --run performance | |||
| ``` | |||
| #### 3. 手动运行测试 | |||
| ```bash | |||
| ./build/test_reducesum [correctness|performance|all] | |||
| ./build/test_sortpair [correctness|performance|all] | |||
| ./build/test_topkpair [correctness|performance|all] | |||
| ``` | |||
| ## 📝 参赛指南 | |||
| ### 实现位置 | |||
| 参赛者需要在以下文件中替换Thrust实现: | |||
| - `src/reduce_sum_algorithm.maca` - 替换Thrust归约求和 | |||
| - `src/sort_pair_algorithm.maca` - 替换Thrust稳定排序 | |||
| - `src/topk_pair_algorithm.maca` - 替换Thrust TopK选择 | |||
| ### 算法要求 | |||
| 见competition_parallel_algorithms.md | |||
| ## 📊 性能评测 | |||
| ### 测试流程 | |||
| 1. **Warmup**: 5次预热运行 | |||
| 2. **Benchmark**: 10次正式测试取平均 | |||
| 3. **数据规模**: 1M, 128M, 512M, 1G elements | |||
| 4. **评估指标**: 吞吐量(G/s) | |||
| ### 性能指标计算 | |||
| #### ReduceSum | |||
| - **数据类型**: float → float | |||
| - **吞吐量**: elements / time(s) / 1e9 (G/s) | |||
| #### SortPair | |||
| - **数据类型**: <float, uint32_t> | |||
| - **吞吐量**: elements / time(s) / 1e9 (G/s) | |||
| #### TopkPair | |||
| - **数据类型**: <float, uint32_t> | |||
| - **吞吐量**: elements / time(s) / 1e9 (G/s) | |||
| ### 性能结果文件 | |||
| 每个算法会生成详细的YAML性能分析文件: | |||
| - `reduce_sum_performance.yaml` - ReduceSum性能数据 | |||
| - `sort_pair_performance.yaml` - SortPair性能数据 | |||
| - `topk_pair_performance.yaml` - TopkPair性能数据 | |||
| 这些文件包含: | |||
| - 算法信息和数据类型 | |||
| - 计算公式说明 | |||
| - 各数据规模的详细性能数据 | |||
| - 升序/降序分别统计(适用时) | |||
| ## 📁 项目结构 | |||
| ``` | |||
| ├── build_and_run.sh # 统一编译和运行脚本(默认编译+运行所有算法) | |||
| ├── build_common.sh # 公共编译配置和函数 | |||
| ├── build_and_run_reduce_sum.sh # ReduceSum独立编译和运行脚本 | |||
| ├── build_and_run_sort_pair.sh # SortPair独立编译和运行脚本 | |||
| ├── build_and_run_topk_pair.sh # TopkPair独立编译和运行脚本 | |||
| ├── competition_parallel_algorithms.md # 详细题目说明 | |||
| ├── src/ # 算法实现和工具文件 | |||
| │ ├── reduce_sum_algorithm.maca # 1. ReduceSum测试程序 | |||
| │ ├── sort_pair_algorithm.maca # 2. SortPair测试程序 | |||
| │ ├── topk_pair_algorithm.maca # 3. TopkPair测试程序 | |||
| │ ├── test_utils.h # 测试工具和CPU参考实现 | |||
| │ ├── yaml_reporter.h # YAML性能报告生成器 | |||
| │ └── performance_utils.h # 性能测试工具 | |||
| ├── final_results/reduce_sum_results.yaml #ReduceSum性能数据 | |||
| ├── final_results/sort_pair_results.yaml #SortPair性能数据 | |||
| └── final_results/topk_pair_results.yaml #TopkPair性能数据 | |||
| ``` | |||
| ## 🔧 开发工具 | |||
| ### 编译选项 | |||
| ```bash | |||
| # 默认编译命令 | |||
| mxcc -O3 -std=c++17 --extended-lambda -Isrc | |||
| ``` | |||
| ### 自动化测试 | |||
| ```bash | |||
| # 查看所有选项 | |||
| ./build.sh --help | |||
| # 运行所有测试并生成YAML报告 | |||
| ./build.sh --run_all | |||
| ``` | |||
| ### 环境变量配置 | |||
| | 变量 | 默认值 | 说明 | | |||
| |--------|--------|------| | |||
| | `COMPILER` | `mxcc` | MACA编译器路径 | | |||
| | `COMPILER_FLAGS` | `-O3 -std=c++17 --extended-lambda` | 编译标志 | | |||
| | `INCLUDE_DIR` | `src` | 头文件目录 | | |||
| | `BUILD_DIR` | `build` | 构建输出目录 | | |||
| ### 调试模式 | |||
| ## 📋 提交清单 | |||
| 在提交前请确保: | |||
| - [ ] 所有算法通过正确性测试 | |||
| - [ ] 性能测试可以正常运行 | |||
| - [ ] 代码注释清晰,说明优化策略 | |||
| - [ ] 无内存泄漏或运行时错误 | |||
| - [ ] 生成完整测试报告 | |||
| - [ ] 在函数实现注释中说明创新点 | |||
| ```bash | |||
| # 提交时包含以下文件 | |||
| # - final_results/reduce_sum_results.yaml | |||
| # - final_results/sort_pair_results.yaml | |||
| # - final_results/topk_pair_results.yaml | |||
| ``` | |||
| ## 🤝 技术支持 | |||
| 如有技术问题,请: | |||
| 1. 查看详细错误信息和GPU状态 | |||
| 2. 确认环境配置正确 | |||
| 3. 检查内存使用是否超限 | |||
| 4. 验证算法逻辑和数据类型 | |||
| --- | |||
| **祝您在竞赛中取得优异成绩!** 🏆 | |||
| @@ -0,0 +1,277 @@ | |||
| #include "test_utils.h" | |||
| #include "performance_utils.h" | |||
| #include "yaml_reporter.h" | |||
| #include <iostream> | |||
| #include <vector> | |||
| #include <iomanip> | |||
| // ============================================================================ | |||
| // 实现标记宏 - 参赛者修改实现时请将此宏设为0 | |||
| // ============================================================================ | |||
| #ifndef USE_DEFAULT_REF_IMPL | |||
| #define USE_DEFAULT_REF_IMPL 1 // 1=默认实现, 0=参赛者自定义实现 | |||
| #endif | |||
| #if USE_DEFAULT_REF_IMPL | |||
| #include <thrust/reduce.h> | |||
| #include <thrust/device_vector.h> | |||
| #include <thrust/execution_policy.h> | |||
| #include <thrust/functional.h> | |||
| #endif | |||
| // 误差容忍度 | |||
| constexpr double REDUCE_ERROR_TOLERANCE = 0.005; // 0.5% | |||
// ============================================================================
// ReduceSum algorithm interface
// Contestants replace the Thrust-based reference with their own
// high-performance kernel(s).
// ============================================================================
template <typename InputT = float, typename OutputT = float>
class ReduceSumAlgorithm {
public:
    // Main entry point - this is the function contestants implement.
    //
    //   d_in        first of num_items input elements; the reference build wraps
    //               it with thrust::device_pointer_cast
    //   d_out       location that receives the single reduced value; wrapped the
    //               same way in the reference build
    //   num_items   number of input elements
    //   init_value  starting value of the reduction
    void reduce(const InputT* d_in, OutputT* d_out, int num_items, OutputT init_value) {
#if USE_DEFAULT_REF_IMPL
        // ----------------------------------------
        // Default reference implementation
        // ----------------------------------------
        const auto first = thrust::device_pointer_cast(d_in);
        const auto last = first + num_items;

        // A single thrust::reduce call produces the total, seeded with init_value.
        const OutputT total = thrust::reduce(
            thrust::device, first, last, static_cast<OutputT>(init_value));

        // Store the total through the device pointer wrapper.
        *thrust::device_pointer_cast(d_out) = total;
#else
        // ----------------------------------------
        // Contestant implementation area
        // ----------------------------------------
        // TODO: implement your own high-performance reduction here.
        // Example: one or more custom kernels may be launched, e.g.
        // blockReduceKernel<<<grid, block>>>(d_in, temp_results, num_items, init_value);
        // finalReduceKernel<<<1, block>>>(temp_results, d_out, grid.x);
#endif
    }

    // Reports which implementation this translation unit was compiled with.
    static const char* getImplementationStatus() {
#if !USE_DEFAULT_REF_IMPL
        return "CUSTOM_IMPL";
#else
        return "DEFAULT_REF_IMPL";
#endif
    }

private:
    // Contestants may add helper functions and member variables here,
    // e.g. intermediate result buffers or multi-stage reduction state.
};
| // ============================================================================ | |||
| // 测试和性能评估 | |||
| // ============================================================================ | |||
| bool testCorrectness() { | |||
| std::cout << "ReduceSum 正确性测试..." << std::endl; | |||
| TestDataGenerator generator; | |||
| ReduceSumAlgorithm<float, float> algorithm; | |||
| bool allPassed = true; | |||
| // 测试不同数据规模 | |||
| for (int i = 0; i < NUM_TEST_SIZES && i < 2; i++) { // 限制测试规模 | |||
| int size = std::min(TEST_SIZES[i], 10000); | |||
| std::cout << " 测试规模: " << size << std::endl; | |||
| // 测试普通数据 | |||
| { | |||
| auto data = generator.generateRandomFloats(size, -10.0f, 10.0f); | |||
| float init_value = 1.0f; | |||
| // CPU参考计算 | |||
| double cpu_result = cpuReduceSum(data, static_cast<double>(init_value)); | |||
| // GPU计算 | |||
| float *d_in; | |||
| float *d_out; | |||
| MACA_CHECK(mcMalloc(&d_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_out, sizeof(float))); | |||
| MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| algorithm.reduce(d_in, d_out, size, init_value); | |||
| float gpu_result; | |||
| MACA_CHECK(mcMemcpy(&gpu_result, d_out, sizeof(float), mcMemcpyDeviceToHost)); | |||
| // 验证误差 | |||
| double relative_error = std::abs(gpu_result - cpu_result) / std::abs(cpu_result); | |||
| if (relative_error > REDUCE_ERROR_TOLERANCE) { | |||
| std::cout << " 失败: 误差过大 " << relative_error << std::endl; | |||
| allPassed = false; | |||
| } else { | |||
| std::cout << " 通过 (误差: " << relative_error << ")" << std::endl; | |||
| } | |||
| mcFree(d_in); | |||
| mcFree(d_out); | |||
| } | |||
| // 测试特殊值 (NaN, Inf) | |||
| if (size > 100) { | |||
| std::cout << " 测试特殊值..." << std::endl; | |||
| auto data = generator.generateSpecialFloats(size); | |||
| float init_value = 0.0f; | |||
| double cpu_result = cpuReduceSum(data, static_cast<double>(init_value)); | |||
| float *d_in; | |||
| float *d_out; | |||
| MACA_CHECK(mcMalloc(&d_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_out, sizeof(float))); | |||
| MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| algorithm.reduce(d_in, d_out, size, init_value); | |||
| float gpu_result; | |||
| MACA_CHECK(mcMemcpy(&gpu_result, d_out, sizeof(float), mcMemcpyDeviceToHost)); | |||
| // 对于包含特殊值的情况,检查是否正确处理 | |||
| if (std::isfinite(cpu_result) && std::isfinite(gpu_result)) { | |||
| double relative_error = std::abs(gpu_result - cpu_result) / std::abs(cpu_result); | |||
| if (relative_error > REDUCE_ERROR_TOLERANCE) { | |||
| std::cout << " 失败: 特殊值处理错误" << std::endl; | |||
| allPassed = false; | |||
| } else { | |||
| std::cout << " 通过 (特殊值处理)" << std::endl; | |||
| } | |||
| } else { | |||
| std::cout << " 通过 (特殊值结果)" << std::endl; | |||
| } | |||
| mcFree(d_in); | |||
| mcFree(d_out); | |||
| } | |||
| } | |||
| return allPassed; | |||
| } | |||
| void benchmarkPerformance() { | |||
| PerformanceDisplay::printReduceSumHeader(); | |||
| TestDataGenerator generator; | |||
| PerformanceMeter meter; | |||
| ReduceSumAlgorithm<float, float> algorithm; | |||
| const int WARMUP_ITERATIONS = 5; | |||
| const int BENCHMARK_ITERATIONS = 10; | |||
| // 用于YAML报告的数据收集 | |||
| std::vector<std::map<std::string, std::string>> perf_data; | |||
| for (int i = 0; i < NUM_TEST_SIZES; i++) { | |||
| int size = TEST_SIZES[i]; | |||
| // 生成测试数据 | |||
| auto data = generator.generateRandomFloats(size); | |||
| float init_value = 0.0f; | |||
| // 分配GPU内存 | |||
| float *d_in; | |||
| float *d_out; | |||
| MACA_CHECK(mcMalloc(&d_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_out, sizeof(float))); | |||
| MACA_CHECK(mcMemcpy(d_in, data.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| // Warmup阶段 | |||
| for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) { | |||
| algorithm.reduce(d_in, d_out, size, init_value); | |||
| } | |||
| // 正式测试阶段 | |||
| float total_time = 0; | |||
| for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) { | |||
| meter.startTiming(); | |||
| algorithm.reduce(d_in, d_out, size, init_value); | |||
| total_time += meter.stopTiming(); | |||
| } | |||
| float avg_time = total_time / BENCHMARK_ITERATIONS; | |||
| // 计算性能指标 | |||
| auto metrics = PerformanceCalculator::calculateReduceSum(size, avg_time); | |||
| // 显示性能数据 | |||
| PerformanceDisplay::printReduceSumData(size, avg_time, metrics); | |||
| // 收集YAML报告数据 | |||
| auto entry = YAMLPerformanceReporter::createEntry(); | |||
| entry["data_size"] = std::to_string(size); | |||
| entry["time_ms"] = std::to_string(avg_time); | |||
| entry["throughput_gps"] = std::to_string(metrics.throughput_gps); | |||
| entry["data_type"] = "float"; | |||
| perf_data.push_back(entry); | |||
| mcFree(d_in); | |||
| mcFree(d_out); | |||
| } | |||
| // 生成YAML性能报告 | |||
| YAMLPerformanceReporter::generateReduceSumYAML(perf_data, "reduce_sum_performance.yaml"); | |||
| PerformanceDisplay::printSavedMessage("reduce_sum_performance.yaml"); | |||
| } | |||
| // ============================================================================ | |||
| // 主函数 | |||
| // ============================================================================ | |||
| int main(int argc, char* argv[]) { | |||
| std::cout << "=== ReduceSum 算法测试 ===" << std::endl; | |||
| // 检查参数 | |||
| std::string mode = "all"; | |||
| if (argc > 1) { | |||
| mode = argv[1]; | |||
| } | |||
| bool correctness_passed = true; | |||
| bool performance_completed = true; | |||
| try { | |||
| if (mode == "correctness" || mode == "all") { | |||
| correctness_passed = testCorrectness(); | |||
| } | |||
| if (mode == "performance" || mode == "all") { | |||
| if (correctness_passed || mode == "performance") { | |||
| benchmarkPerformance(); | |||
| } else { | |||
| std::cout << "跳过性能测试,因为正确性测试未通过" << std::endl; | |||
| performance_completed = false; | |||
| } | |||
| } | |||
| std::cout << "\n=== 测试完成 ===" << std::endl; | |||
| std::cout << "实现状态: " << ReduceSumAlgorithm<float, float>::getImplementationStatus() << std::endl; | |||
| if (mode == "all") { | |||
| std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl; | |||
| std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl; | |||
| } | |||
| return correctness_passed ? 0 : 1; | |||
| } catch (const std::exception& e) { | |||
| std::cerr << "测试出错: " << e.what() << std::endl; | |||
| return 1; | |||
| } | |||
| } | |||
| @@ -0,0 +1,275 @@ | |||
| #include "test_utils.h" | |||
| #include "performance_utils.h" | |||
| #include "yaml_reporter.h" | |||
| #include <iostream> | |||
| #include <vector> | |||
| #include <iomanip> | |||
| // ============================================================================ | |||
| // 实现标记宏 - 参赛者修改实现时请将此宏设为0 | |||
| // ============================================================================ | |||
| #ifndef USE_DEFAULT_REF_IMPL | |||
| #define USE_DEFAULT_REF_IMPL 1 // 1=默认实现, 0=参赛者自定义实现 | |||
| #endif | |||
| #if USE_DEFAULT_REF_IMPL | |||
| #include <thrust/sort.h> | |||
| #include <thrust/device_vector.h> | |||
| #include <thrust/execution_policy.h> | |||
| #include <thrust/iterator/zip_iterator.h> | |||
| #include <thrust/tuple.h> | |||
| #endif | |||
// ============================================================================
// SortPair algorithm interface
// Contestants replace the Thrust-based reference with their own
// high-performance kernel(s).
// ============================================================================
template <typename KeyType, typename ValueType>
class SortPairAlgorithm {
public:
    // Main entry point - this is the function contestants implement.
    //
    // The reference build copies the num_items input keys/values into the
    // output buffers and then stable-sorts the outputs by key, in descending
    // order when `descending` is true and ascending order otherwise.
    void sort(const KeyType* d_keys_in, KeyType* d_keys_out,
              const ValueType* d_values_in, ValueType* d_values_out,
              int num_items, bool descending) {
#if USE_DEFAULT_REF_IMPL
        // ----------------------------------------
        // Default reference implementation
        // ----------------------------------------
        const auto key_bytes = num_items * sizeof(KeyType);
        const auto value_bytes = num_items * sizeof(ValueType);

        // Sort works on the output buffers, so seed them with the input first.
        MACA_CHECK(mcMemcpy(d_keys_out, d_keys_in, key_bytes, mcMemcpyDeviceToDevice));
        MACA_CHECK(mcMemcpy(d_values_out, d_values_in, value_bytes, mcMemcpyDeviceToDevice));

        const auto keys_begin = thrust::device_pointer_cast(d_keys_out);
        const auto keys_end = keys_begin + num_items;
        const auto values_begin = thrust::device_pointer_cast(d_values_out);

        // The two comparators are distinct types, hence two separate calls.
        if (!descending) {
            thrust::stable_sort_by_key(thrust::device, keys_begin, keys_end, values_begin,
                                       thrust::less<KeyType>());
        } else {
            thrust::stable_sort_by_key(thrust::device, keys_begin, keys_end, values_begin,
                                       thrust::greater<KeyType>());
        }
#else
        // ----------------------------------------
        // Contestant implementation area
        // ----------------------------------------
        // TODO: implement your own high-performance sort here.
        // Example: one or more custom kernels may be launched, e.g.
        // preprocessKernel<<<grid, block>>>(d_keys_in, d_values_in, num_items);
        // mainSortKernel<<<grid, block>>>(d_keys_out, d_values_out, num_items, descending);
        // postprocessKernel<<<grid, block>>>(d_keys_out, d_values_out, num_items);
#endif
    }

    // Reports which implementation this translation unit was compiled with.
    static const char* getImplementationStatus() {
#if !USE_DEFAULT_REF_IMPL
        return "CUSTOM_IMPL";
#else
        return "DEFAULT_REF_IMPL";
#endif
    }

private:
    // Contestants may add helper functions and member variables here,
    // e.g. temporary buffers, additional kernels, or streams.
};
| // ============================================================================ | |||
| // 测试和性能评估 | |||
| // ============================================================================ | |||
| bool testCorrectness() { | |||
| std::cout << "SortPair 正确性测试..." << std::endl; | |||
| TestDataGenerator generator; | |||
| SortPairAlgorithm<float, uint32_t> algorithm; | |||
| // 测试小规模数据 | |||
| int size = 10000; | |||
| auto keys = generator.generateRandomFloats(size); | |||
| auto values = generator.generateRandomUint32(size); | |||
| // 分配GPU内存 | |||
| float *d_keys_in, *d_keys_out; | |||
| uint32_t *d_values_in, *d_values_out; | |||
| MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_keys_out, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMalloc(&d_values_out, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); | |||
| // 测试升序和降序 | |||
| bool allPassed = true; | |||
| for (bool descending : {false, true}) { | |||
| std::cout << " " << (descending ? "降序" : "升序") << " 测试..." << std::endl; | |||
| // CPU参考结果 | |||
| auto cpu_keys = keys; | |||
| auto cpu_values = values; | |||
| cpuSortPair(cpu_keys, cpu_values, descending); | |||
| // GPU算法结果 | |||
| algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending); | |||
| // 获取结果 | |||
| std::vector<float> gpu_keys(size); | |||
| std::vector<uint32_t> gpu_values(size); | |||
| MACA_CHECK(mcMemcpy(gpu_keys.data(), d_keys_out, size * sizeof(float), mcMemcpyDeviceToHost)); | |||
| MACA_CHECK(mcMemcpy(gpu_values.data(), d_values_out, size * sizeof(uint32_t), mcMemcpyDeviceToHost)); | |||
| // 验证结果 | |||
| bool keysMatch = compareArrays(cpu_keys, gpu_keys, 1e-5); | |||
| bool valuesMatch = compareArrays(cpu_values, gpu_values); | |||
| if (!keysMatch || !valuesMatch) { | |||
| std::cout << " 失败: 结果不匹配" << std::endl; | |||
| allPassed = false; | |||
| } else { | |||
| std::cout << " 通过" << std::endl; | |||
| } | |||
| } | |||
| // 清理内存 | |||
| mcFree(d_keys_in); | |||
| mcFree(d_keys_out); | |||
| mcFree(d_values_in); | |||
| mcFree(d_values_out); | |||
| return allPassed; | |||
| } | |||
| void benchmarkPerformance() { | |||
| PerformanceDisplay::printSortPairHeader(); | |||
| TestDataGenerator generator; | |||
| PerformanceMeter meter; | |||
| SortPairAlgorithm<float, uint32_t> algorithm; | |||
| const int WARMUP_ITERATIONS = 5; | |||
| const int BENCHMARK_ITERATIONS = 10; | |||
| // 用于YAML报告的数据收集 | |||
| std::vector<std::map<std::string, std::string>> perf_data; | |||
| for (int i = 0; i < NUM_TEST_SIZES; i++) { | |||
| int size = TEST_SIZES[i]; | |||
| // 生成测试数据 | |||
| auto keys = generator.generateRandomFloats(size); | |||
| auto values = generator.generateRandomUint32(size); | |||
| // 分配GPU内存 | |||
| float *d_keys_in, *d_keys_out; | |||
| uint32_t *d_values_in, *d_values_out; | |||
| MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_keys_out, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMalloc(&d_values_out, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); | |||
| float asc_time = 0, desc_time = 0; | |||
| // 测试升序和降序 | |||
| for (bool descending : {false, true}) { | |||
| // Warmup阶段 | |||
| for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) { | |||
| algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending); | |||
| } | |||
| // 正式测试阶段 | |||
| float total_time = 0; | |||
| for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) { | |||
| meter.startTiming(); | |||
| algorithm.sort(d_keys_in, d_keys_out, d_values_in, d_values_out, size, descending); | |||
| total_time += meter.stopTiming(); | |||
| } | |||
| float avg_time = total_time / BENCHMARK_ITERATIONS; | |||
| if (descending) { | |||
| desc_time = avg_time; | |||
| } else { | |||
| asc_time = avg_time; | |||
| } | |||
| } | |||
| // 计算性能指标 | |||
| auto asc_metrics = PerformanceCalculator::calculateSortPair(size, asc_time); | |||
| auto desc_metrics = PerformanceCalculator::calculateSortPair(size, desc_time); | |||
| // 显示性能数据 | |||
| PerformanceDisplay::printSortPairData(size, asc_time, desc_time, asc_metrics, desc_metrics); | |||
| // 收集YAML报告数据 | |||
| auto entry = YAMLPerformanceReporter::createEntry(); | |||
| entry["data_size"] = std::to_string(size); | |||
| entry["asc_time_ms"] = std::to_string(asc_time); | |||
| entry["desc_time_ms"] = std::to_string(desc_time); | |||
| entry["asc_throughput_gps"] = std::to_string(asc_metrics.throughput_gps); | |||
| entry["desc_throughput_gps"] = std::to_string(desc_metrics.throughput_gps); | |||
| entry["key_type"] = "float"; | |||
| entry["value_type"] = "uint32_t"; | |||
| perf_data.push_back(entry); | |||
| // 清理内存 | |||
| mcFree(d_keys_in); | |||
| mcFree(d_keys_out); | |||
| mcFree(d_values_in); | |||
| mcFree(d_values_out); | |||
| } | |||
| // 生成YAML性能报告 | |||
| YAMLPerformanceReporter::generateSortPairYAML(perf_data, "sort_pair_performance.yaml"); | |||
| PerformanceDisplay::printSavedMessage("sort_pair_performance.yaml"); | |||
| } | |||
| // ============================================================================ | |||
| // 主函数 | |||
| // ============================================================================ | |||
| int main(int argc, char* argv[]) { | |||
| std::cout << "=== SortPair 算法测试 ===" << std::endl; | |||
| // 检查参数 | |||
| std::string mode = "all"; | |||
| if (argc > 1) { | |||
| mode = argv[1]; | |||
| } | |||
| bool correctness_passed = true; | |||
| bool performance_completed = true; | |||
| try { | |||
| if (mode == "correctness" || mode == "all") { | |||
| correctness_passed = testCorrectness(); | |||
| } | |||
| if (mode == "performance" || mode == "all") { | |||
| if (correctness_passed || mode == "performance") { | |||
| benchmarkPerformance(); | |||
| } else { | |||
| std::cout << "跳过性能测试,因为正确性测试未通过" << std::endl; | |||
| performance_completed = false; | |||
| } | |||
| } | |||
| std::cout << "\n=== 测试完成 ===" << std::endl; | |||
| std::cout << "实现状态: " << SortPairAlgorithm<float, uint32_t>::getImplementationStatus() << std::endl; | |||
| if (mode == "all") { | |||
| std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl; | |||
| std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl; | |||
| } | |||
| return correctness_passed ? 0 : 1; | |||
| } catch (const std::exception& e) { | |||
| std::cerr << "测试出错: " << e.what() << std::endl; | |||
| return 1; | |||
| } | |||
| } | |||
| @@ -0,0 +1,317 @@ | |||
| #include "test_utils.h" | |||
| #include "performance_utils.h" | |||
| #include "yaml_reporter.h" | |||
| #include <iostream> | |||
| #include <vector> | |||
| #include <iomanip> | |||
| #include <fstream> | |||
| #include <map> | |||
| #include <chrono> | |||
| // ============================================================================ | |||
| // 实现标记宏 - 参赛者修改实现时请将此宏设为0 | |||
| // ============================================================================ | |||
| #ifndef USE_DEFAULT_REF_IMPL | |||
| #define USE_DEFAULT_REF_IMPL 1 // 1=默认实现, 0=参赛者自定义实现 | |||
| #endif | |||
| #if USE_DEFAULT_REF_IMPL | |||
| #include <thrust/sort.h> | |||
| #include <thrust/device_vector.h> | |||
| #include <thrust/execution_policy.h> | |||
| #include <thrust/iterator/zip_iterator.h> | |||
| #include <thrust/tuple.h> | |||
| #include <thrust/copy.h> | |||
| #endif | |||
// k values exercised by the TopkPair tests and benchmarks.
static constexpr int TOPK_VALUES[] = {32, 50, 100, 256, 1024};
// Number of entries in TOPK_VALUES.
static constexpr int NUM_TOPK_VALUES =
    static_cast<int>(sizeof(TOPK_VALUES) / sizeof(*TOPK_VALUES));
// ============================================================================
// TopkPair algorithm interface
// Contestants replace the Thrust-based reference with their own
// high-performance kernel(s).
// ============================================================================
template <typename KeyType, typename ValueType>
class TopkPairAlgorithm {
public:
    // Main entry point - this is the function contestants implement.
    //
    // The reference build stable-sorts a scratch copy of the num_items input
    // pairs by key (descending when `descending` is true, ascending otherwise)
    // and copies the leading k pairs to d_keys_out / d_values_out.
    //
    // In the reference build a non-positive num_items or k is a no-op, and a
    // k larger than num_items is clamped so that only num_items pairs are
    // written.
    void topk(const KeyType* d_keys_in, KeyType* d_keys_out,
              const ValueType* d_values_in, ValueType* d_values_out,
              int num_items, int k, bool descending) {
#if !USE_DEFAULT_REF_IMPL
        // ----------------------------------------
        // Contestant implementation area
        // ----------------------------------------
        // TODO: implement your own high-performance TopK here.
        // Example: several custom kernels may be launched, e.g.
        // TopkKernel1<<<grid, block>>>(d_keys_in, d_values_in, temp_results, num_items, k);
        // TopkKernel2<<<grid, block>>>(temp_results, d_keys_out, d_values_out, k, descending);
#else
        // ----------------------------------------
        // Default reference implementation
        // ----------------------------------------
        // Nothing to select; also keeps negative counts from turning into
        // huge unsigned byte sizes below.
        if (num_items <= 0 || k <= 0) {
            return;
        }
        // The scratch buffers hold num_items elements, so never copy more
        // than that out of them.
        const int out_count = (k < num_items) ? k : num_items;

        KeyType* temp_keys;
        ValueType* temp_values;
        MACA_CHECK(mcMalloc(&temp_keys, num_items * sizeof(KeyType)));
        MACA_CHECK(mcMalloc(&temp_values, num_items * sizeof(ValueType)));
        MACA_CHECK(mcMemcpy(temp_keys, d_keys_in, num_items * sizeof(KeyType), mcMemcpyDeviceToDevice));
        MACA_CHECK(mcMemcpy(temp_values, d_values_in, num_items * sizeof(ValueType), mcMemcpyDeviceToDevice));

        auto key_ptr = thrust::device_pointer_cast(temp_keys);
        auto value_ptr = thrust::device_pointer_cast(temp_values);

        // greater and less are different types, so each needs its own call.
        if (descending) {
            thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::greater<KeyType>());
        } else {
            thrust::stable_sort_by_key(thrust::device, key_ptr, key_ptr + num_items, value_ptr, thrust::less<KeyType>());
        }

        MACA_CHECK(mcMemcpy(d_keys_out, temp_keys, out_count * sizeof(KeyType), mcMemcpyDeviceToDevice));
        MACA_CHECK(mcMemcpy(d_values_out, temp_values, out_count * sizeof(ValueType), mcMemcpyDeviceToDevice));

        mcFree(temp_keys);
        mcFree(temp_values);
#endif
    }

    // Reports which implementation this translation unit was compiled with.
    static const char* getImplementationStatus() {
#if USE_DEFAULT_REF_IMPL
        return "DEFAULT_REF_IMPL";
#else
        return "CUSTOM_IMPL";
#endif
    }

private:
    // Contestants may add helper functions and member variables here,
    // e.g. tile sizes, temporary buffers, or multi-stream state.
};
| // ============================================================================ | |||
| // 测试和性能评估 | |||
| // ============================================================================ | |||
| bool testCorrectness() { | |||
| std::cout << "TopkPair 正确性测试..." << std::endl; | |||
| TestDataGenerator generator; | |||
| TopkPairAlgorithm<float, uint32_t> algorithm; | |||
| int size = 10000; | |||
| auto keys = generator.generateRandomFloats(size); | |||
| auto values = generator.generateRandomUint32(size); | |||
| // 分配GPU内存 | |||
| float *d_keys_in, *d_keys_out; | |||
| uint32_t *d_values_in, *d_values_out; | |||
| MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); | |||
| bool allPassed = true; | |||
| // 测试不同k值 | |||
| for (int ki = 0; ki < NUM_TOPK_VALUES && ki < 4; ki++) { // 限制测试范围 | |||
| int k = TOPK_VALUES[ki]; | |||
| if (k > size) continue; | |||
| std::cout << " 测试 k=" << k << std::endl; | |||
| MACA_CHECK(mcMalloc(&d_keys_out, k * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_out, k * sizeof(uint32_t))); | |||
| for (bool descending : {false, true}) { | |||
| std::cout << " " << (descending ? "降序" : "升序") << " TopK..." << std::endl; | |||
| // CPU参考结果 | |||
| std::vector<float> cpu_keys_out; | |||
| std::vector<uint32_t> cpu_values_out; | |||
| cpuTopkPair(keys, values, cpu_keys_out, cpu_values_out, k, descending); | |||
| // GPU算法结果 | |||
| algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending); | |||
| // 获取结果 | |||
| std::vector<float> gpu_keys_out(k); | |||
| std::vector<uint32_t> gpu_values_out(k); | |||
| MACA_CHECK(mcMemcpy(gpu_keys_out.data(), d_keys_out, k * sizeof(float), mcMemcpyDeviceToHost)); | |||
| MACA_CHECK(mcMemcpy(gpu_values_out.data(), d_values_out, k * sizeof(uint32_t), mcMemcpyDeviceToHost)); | |||
| // 验证结果 | |||
| bool keysMatch = compareArrays(cpu_keys_out, gpu_keys_out, 1e-5); | |||
| bool valuesMatch = compareArrays(cpu_values_out, gpu_values_out); | |||
| if (!keysMatch || !valuesMatch) { | |||
| std::cout << " 失败: 结果不匹配" << std::endl; | |||
| allPassed = false; | |||
| } else { | |||
| std::cout << " 通过" << std::endl; | |||
| } | |||
| } | |||
| mcFree(d_keys_out); | |||
| mcFree(d_values_out); | |||
| } | |||
| // 清理内存 | |||
| mcFree(d_keys_in); | |||
| mcFree(d_values_in); | |||
| return allPassed; | |||
| } | |||
| void benchmarkPerformance() { | |||
| std::cout << "\nTopkPair 性能测试..." << std::endl; | |||
| std::cout << "数据类型: <float, uint32_t>" << std::endl; | |||
| std::cout << "计算公式:" << std::endl; | |||
| std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; | |||
| TestDataGenerator generator; | |||
| PerformanceMeter meter; | |||
| TopkPairAlgorithm<float, uint32_t> algorithm; | |||
| const int WARMUP_ITERATIONS = 5; | |||
| const int BENCHMARK_ITERATIONS = 10; | |||
| // 用于YAML报告的数据收集 | |||
| std::vector<std::map<std::string, std::string>> perf_data; | |||
| // 针对不同数据规模测试 | |||
| for (int size_idx = 0; size_idx < NUM_TEST_SIZES; size_idx++) { | |||
| int size = TEST_SIZES[size_idx]; | |||
| std::cout << "\n数据规模: " << size << std::endl; | |||
| std::cout << std::setw(8) << "k值" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)" | |||
| << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl; | |||
| std::cout << std::string(74, '-') << std::endl; | |||
| auto keys = generator.generateRandomFloats(size); | |||
| auto values = generator.generateRandomUint32(size); | |||
| // 分配GPU内存 | |||
| float *d_keys_in; | |||
| uint32_t *d_values_in; | |||
| MACA_CHECK(mcMalloc(&d_keys_in, size * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_in, size * sizeof(uint32_t))); | |||
| MACA_CHECK(mcMemcpy(d_keys_in, keys.data(), size * sizeof(float), mcMemcpyHostToDevice)); | |||
| MACA_CHECK(mcMemcpy(d_values_in, values.data(), size * sizeof(uint32_t), mcMemcpyHostToDevice)); | |||
| for (int ki = 0; ki < NUM_TOPK_VALUES; ki++) { | |||
| int k = TOPK_VALUES[ki]; | |||
| if (k > size) continue; | |||
| float *d_keys_out; | |||
| uint32_t *d_values_out; | |||
| MACA_CHECK(mcMalloc(&d_keys_out, k * sizeof(float))); | |||
| MACA_CHECK(mcMalloc(&d_values_out, k * sizeof(uint32_t))); | |||
| float asc_time = 0, desc_time = 0; | |||
| for (bool descending : {false, true}) { | |||
| // Warmup阶段 | |||
| for (int iter = 0; iter < WARMUP_ITERATIONS; iter++) { | |||
| algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending); | |||
| } | |||
| // 正式测试阶段 | |||
| float total_time = 0; | |||
| for (int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) { | |||
| meter.startTiming(); | |||
| algorithm.topk(d_keys_in, d_keys_out, d_values_in, d_values_out, size, k, descending); | |||
| total_time += meter.stopTiming(); | |||
| } | |||
| float avg_time = total_time / BENCHMARK_ITERATIONS; | |||
| if (descending) { | |||
| desc_time = avg_time; | |||
| } else { | |||
| asc_time = avg_time; | |||
| } | |||
| } | |||
| // 计算性能指标 | |||
| auto asc_metrics = PerformanceCalculator::calculateTopkPair(size, k, asc_time); | |||
| auto desc_metrics = PerformanceCalculator::calculateTopkPair(size, k, desc_time); | |||
| // 显示性能数据 | |||
| PerformanceDisplay::printTopkPairData(k, asc_time, desc_time, asc_metrics, desc_metrics); | |||
| // 收集YAML报告数据 | |||
| auto entry = YAMLPerformanceReporter::createEntry(); | |||
| entry["data_size"] = std::to_string(size); | |||
| entry["k_value"] = std::to_string(k); | |||
| entry["asc_time_ms"] = std::to_string(asc_time); | |||
| entry["desc_time_ms"] = std::to_string(desc_time); | |||
| entry["asc_throughput_gps"] = std::to_string(asc_metrics.throughput_gps); | |||
| entry["desc_throughput_gps"] = std::to_string(desc_metrics.throughput_gps); | |||
| entry["key_type"] = "float"; | |||
| entry["value_type"] = "uint32_t"; | |||
| perf_data.push_back(entry); | |||
| mcFree(d_keys_out); | |||
| mcFree(d_values_out); | |||
| } | |||
| mcFree(d_keys_in); | |||
| mcFree(d_values_in); | |||
| } | |||
| // 生成YAML性能报告 | |||
| YAMLPerformanceReporter::generateTopkPairYAML(perf_data, "topk_pair_performance.yaml"); | |||
| PerformanceDisplay::printSavedMessage("topk_pair_performance.yaml"); | |||
| } | |||
| // ============================================================================ | |||
| // 主函数 | |||
| // ============================================================================ | |||
| int main(int argc, char* argv[]) { | |||
| std::cout << "=== TopkPair 算法测试 ===" << std::endl; | |||
| // 检查参数 | |||
| std::string mode = "all"; | |||
| if (argc > 1) { | |||
| mode = argv[1]; | |||
| } | |||
| bool correctness_passed = true; | |||
| bool performance_completed = true; | |||
| try { | |||
| if (mode == "correctness" || mode == "all") { | |||
| correctness_passed = testCorrectness(); | |||
| } | |||
| if (mode == "performance" || mode == "all") { | |||
| if (correctness_passed || mode == "performance") { | |||
| benchmarkPerformance(); | |||
| } else { | |||
| std::cout << "跳过性能测试,因为正确性测试未通过" << std::endl; | |||
| performance_completed = false; | |||
| } | |||
| } | |||
| std::cout << "\n=== 测试完成 ===" << std::endl; | |||
| std::cout << "实现状态: " << TopkPairAlgorithm<float, uint32_t>::getImplementationStatus() << std::endl; | |||
| if (mode == "all") { | |||
| std::cout << "正确性: " << (correctness_passed ? "通过" : "失败") << std::endl; | |||
| std::cout << "性能测试: " << (performance_completed ? "完成" : "跳过") << std::endl; | |||
| } | |||
| return correctness_passed ? 0 : 1; | |||
| } catch (const std::exception& e) { | |||
| std::cerr << "测试出错: " << e.what() << std::endl; | |||
| return 1; | |||
| } | |||
| } | |||
| @@ -0,0 +1,274 @@ | |||
| #!/bin/bash | |||
| # GPU高性能并行计算算法优化竞赛 - 统一编译和运行脚本 | |||
| # 整合了所有算法的编译、运行和公共配置 | |||
| # ============================================================================ | |||
| # 公共配置和工具函数 | |||
| # ============================================================================ | |||
# ANSI colour escape sequences used by the print helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[0;33m'
NC='\033[0m' # No Color

# Shared formatter: $1=colour escape, $2=tag text, $3=message.
# Prints "<colour>[TAG]<reset> message" with escapes interpreted (echo -e).
_print_tagged() {
    echo -e "${1}[${2}]${NC} ${3}"
}

# Tagged print helpers; each takes the message as $1.
print_info() {
    _print_tagged "$BLUE" "INFO" "$1"
}

print_success() {
    _print_tagged "$GREEN" "SUCCESS" "$1"
}

print_error() {
    _print_tagged "$RED" "ERROR" "$1"
}

print_warning() {
    _print_tagged "$YELLOW" "WARNING" "$1"
}
# Build configuration - every value can be overridden through the environment.
COMPILER=${COMPILER:-mxcc}
COMPILER_FLAGS=${COMPILER_FLAGS:-"-O3 -std=c++17 --extended-lambda -DRUN_FULL_TEST"}

# Directory handed to the compiler with -I (default: utils).
HEADER_DIR=${HEADER_DIR:-utils}

# Optional directory holding the algorithm sources. When empty (the default)
# the source file name is used exactly as passed to compile_algorithm.
SOURCE_CODE_DIR=${SOURCE_CODE_DIR:-}

BUILD_DIR=${BUILD_DIR:-build}

# Compiles one algorithm into $BUILD_DIR/test_<lowercased algorithm name>.
# Arguments: $1 = algorithm name, $2 = source file name
#            (looked up under SOURCE_CODE_DIR when that variable is set).
# Returns 0 on success, 1 if the source is missing, the build directory cannot
# be created, or the compiler fails.
compile_algorithm() {
    local algo_name="$1"
    local source_file_name="$2" # e.g. "reduce_sum_algorithm.maca"
    local target_file="$BUILD_DIR/test_${algo_name,,}" # lower-cased name

    # Honour SOURCE_CODE_DIR when it is set; strip one trailing slash so the
    # joined path has no doubled separator.
    local source_path="$source_file_name"
    if [ -n "$SOURCE_CODE_DIR" ]; then
        source_path="${SOURCE_CODE_DIR%/}/$source_file_name"
    fi

    print_info "编译 $algo_name 算法..."

    # Fail early with a clear message instead of a compiler error.
    if [ ! -f "$source_path" ]; then
        print_error "$algo_name 源文件不存在: $source_path"
        return 1
    fi

    # Create the build directory.
    if ! mkdir -p "$BUILD_DIR"; then
        print_error "无法创建构建目录: $BUILD_DIR"
        return 1
    fi

    # Build the command as an array so paths containing spaces stay intact.
    # COMPILER and COMPILER_FLAGS are expanded unquoted on purpose: both may
    # carry several whitespace-separated words.
    local -a compile_cmd=($COMPILER $COMPILER_FLAGS "-I$HEADER_DIR" "$source_path" -o "$target_file")
    print_info "执行: ${compile_cmd[*]}"

    if "${compile_cmd[@]}"; then
        # Show a path that is runnable as-is for relative and absolute BUILD_DIR.
        local run_hint="$target_file"
        if [[ "$target_file" != /* ]]; then
            run_hint="./$target_file"
        fi
        print_success "$algo_name 编译完成!"
        echo ""
        echo "运行测试:"
        echo " $run_hint [correctness|performance|all]"
        return 0
    else
        print_error "$algo_name 编译失败!"
        return 1
    fi
}
# Prints the effective build configuration, one setting per line, followed by
# an empty line.
show_build_config() {
    print_info "编译配置:"
    printf ' %s\n' \
        "COMPILER: $COMPILER" \
        "COMPILER_FLAGS: $COMPILER_FLAGS" \
        "HEADER_DIR: $HEADER_DIR" \
        "SOURCE_CODE_DIR: $SOURCE_CODE_DIR" \
        "BUILD_DIR: $BUILD_DIR"
    printf '\n'
}
# Runs one previously built test program.
# Arguments: $1 = algorithm name, $2 = test mode (default: all).
# Returns the test program's exit status, or 1 if the program does not exist.
run_single_test() {
    local algo_name="$1"
    local test_mode="${2:-all}"
    local test_file="$BUILD_DIR/test_${algo_name,,}"

    if [ ! -f "$test_file" ]; then
        print_error "$algo_name 测试程序不存在: $test_file"
        return 1
    fi

    print_info "运行 $algo_name 测试 (模式: $test_mode)..."
    # The path always contains a slash, so it is executed directly without a
    # PATH lookup. Prefixing "./" would break an absolute BUILD_DIR.
    "$test_file" "$test_mode"
}
| # ============================================================================ | |||
| # 主脚本逻辑 | |||
| # ============================================================================ | |||
# Print the usage text (all supported options plus examples) to stdout.
show_help() {
    cat <<EOF
GPU算法竞赛统一编译和运行脚本
用法: $0 [选项]

选项:
 --help 显示帮助信息
 --build-only 仅编译所有算法,不运行测试
 --run_reduce [MODE] 编译并运行ReduceSum算法测试 (MODE: correctness|performance|all, 默认all)
 --run_sort [MODE] 编译并运行SortPair算法测试 (MODE: correctness|performance|all, 默认all)
 --run_topk [MODE] 编译并运行TopkPair算法测试 (MODE: correctness|performance|all, 默认all)

示例:
 $0 # 编译并运行所有测试(默认行为)
 $0 --build-only # 仅编译所有算法
 $0 --run_sort performance # 编译并运行SortPair性能测试

EOF
}
# Command-line parsing.
#   RUN_MODE              - run_all (default), build_only or run_single
#   ALGO_TO_RUN           - algorithm selected by a --run_* option
#   SINGLE_ALGO_TEST_MODE - test mode for that single algorithm
RUN_MODE="run_all"
ALGO_TO_RUN=""
SINGLE_ALGO_TEST_MODE="all"

while [[ $# -gt 0 ]]; do
    case "$1" in
        --help)
            show_help
            exit 0
            ;;
        --build-only)
            RUN_MODE="build_only"
            shift
            ;;
        --run_reduce|--run_sort|--run_topk)
            RUN_MODE="run_single"
            case "$1" in
                --run_reduce) ALGO_TO_RUN="ReduceSum" ;;
                --run_sort)   ALGO_TO_RUN="SortPair" ;;
                --run_topk)   ALGO_TO_RUN="TopkPair" ;;
            esac
            shift
            # An optional MODE may follow; anything starting with "--" is the
            # next option and is left for the next loop iteration.
            if [[ -n "$1" && "$1" != --* ]]; then
                SINGLE_ALGO_TEST_MODE="$1"
                shift
            fi
            ;;
        *)
            print_error "未知选项: $1"
            show_help
            exit 1
            ;;
    esac
done
# Announce what is about to happen. The single-algorithm mode gets its own
# message instead of claiming that every algorithm will be built and run.
case "$RUN_MODE" in
    build_only) print_info "开始编译所有算法..." ;;
    run_single) print_info "开始编译并运行 ${ALGO_TO_RUN} 算法..." ;;
    *)          print_info "开始编译并运行所有算法..." ;;
esac

print_info "工作目录: $(pwd)"
print_info "编译时间: $(date '+%Y-%m-%d %H:%M:%S')"
show_build_config

# Start from an empty build directory. "--" keeps a directory name that
# begins with "-" from being parsed as an rm option.
if [ -d "$BUILD_DIR" ]; then
    print_info "清理现有构建目录: $BUILD_DIR"
    rm -rf -- "$BUILD_DIR"
fi
# Main dispatch on RUN_MODE.
# ALGO_NAMES and ALGO_SOURCES are parallel arrays: entry i of ALGO_SOURCES is
# the source file compiled for entry i of ALGO_NAMES.
ALGO_NAMES=("ReduceSum" "SortPair" "TopkPair")
ALGO_SOURCES=("reduce_sum_algorithm.maca" "sort_pair_algorithm.maca" "topk_pair_algorithm.maca")
ALGO_COUNT=${#ALGO_NAMES[@]}

case "$RUN_MODE" in
    "build_only")
        print_info "编译所有算法..."

        # Stop at the first algorithm that fails to compile.
        for algo_index in "${!ALGO_NAMES[@]}"; do
            algo="${ALGO_NAMES[$algo_index]}"
            print_info "[$((algo_index + 1))/$ALGO_COUNT] 编译${algo}..."
            if ! compile_algorithm "$algo" "${ALGO_SOURCES[$algo_index]}"; then
                print_error "${algo}编译失败"
                exit 1
            fi
        done

        print_success "所有算法编译完成!"
        echo ""
        echo "可执行文件:"
        for algo in "${ALGO_NAMES[@]}"; do
            echo " $BUILD_DIR/test_${algo,,} - ${algo}算法测试"
        done
        echo ""
        echo "使用方法:"
        for algo in "${ALGO_NAMES[@]}"; do
            echo " ./$BUILD_DIR/test_${algo,,} [correctness|performance|all]"
        done
        ;;

    "run_all")
        print_info "编译并运行所有算法测试..."

        # A compile failure aborts immediately. A failing test does not stop
        # the remaining algorithms, but it is counted so the script ends with
        # a non-zero status instead of always reporting success.
        failed_tests=0
        for algo_index in "${!ALGO_NAMES[@]}"; do
            algo="${ALGO_NAMES[$algo_index]}"
            print_info "[$((algo_index + 1))/$ALGO_COUNT] ${algo}..."
            if compile_algorithm "$algo" "${ALGO_SOURCES[$algo_index]}"; then
                run_single_test "$algo" "all" || failed_tests=$((failed_tests + 1))
            else
                exit 1
            fi
        done

        if [ "$failed_tests" -eq 0 ]; then
            print_success "所有测试完成!"
        else
            print_error "有 $failed_tests 个算法测试失败"
            exit 1
        fi
        ;;

    "run_single")
        print_info "编译并运行 ${ALGO_TO_RUN} 测试 (模式: ${SINGLE_ALGO_TEST_MODE})..."

        # Plain assignment: "local" is only valid inside a function and makes
        # bash print an error when used at the top level of a script.
        source_file_name=""
        case "$ALGO_TO_RUN" in
            "ReduceSum") source_file_name="reduce_sum_algorithm.maca" ;;
            "SortPair")  source_file_name="sort_pair_algorithm.maca" ;;
            "TopkPair")  source_file_name="topk_pair_algorithm.maca" ;;
        esac

        if compile_algorithm "$ALGO_TO_RUN" "$source_file_name"; then
            run_single_test "$ALGO_TO_RUN" "$SINGLE_ALGO_TEST_MODE"
        else
            exit 1
        fi
        ;;
esac
| @@ -0,0 +1,114 @@ | |||
| #pragma once | |||
| #include <iostream> | |||
| #include <iomanip> | |||
| #include <string> | |||
| // ============================================================================ | |||
| // 性能计算和显示工具 | |||
| // ============================================================================ | |||
// Converts a measured run (element count + elapsed milliseconds) into a
// throughput figure expressed in 1e9 elements per second ("G/s").
class PerformanceCalculator {
public:
    // Result of a ReduceSum measurement.
    struct ReduceSumMetrics {
        double throughput_gps;  // G elements/s
    };

    // Result of a SortPair measurement.
    struct SortPairMetrics {
        double throughput_gps;  // G elements/s
    };

    // Result of a TopkPair measurement.
    struct TopkPairMetrics {
        double throughput_gps;  // G elements/s
    };

    // Throughput of a ReduceSum run over `size` elements taking `time_ms`.
    static ReduceSumMetrics calculateReduceSum(int size, float time_ms) {
        return ReduceSumMetrics{gigaElementsPerSecond(size, time_ms)};
    }

    // Throughput of a SortPair run over `size` elements taking `time_ms`.
    static SortPairMetrics calculateSortPair(int size, float time_ms) {
        return SortPairMetrics{gigaElementsPerSecond(size, time_ms)};
    }

    // Throughput of a TopkPair run. `k` is accepted for call-site symmetry
    // but does not enter the formula: throughput is based on `size` only.
    static TopkPairMetrics calculateTopkPair(int size, int k, float time_ms) {
        static_cast<void>(k);
        return TopkPairMetrics{gigaElementsPerSecond(size, time_ms)};
    }

private:
    // (elements / 1e9) / (milliseconds / 1000), evaluated in double precision.
    static double gigaElementsPerSecond(int size, float time_ms) {
        return (size / 1e9) / (time_ms / 1000.0);
    }
};
| // ============================================================================ | |||
| // 性能显示工具 | |||
| // ============================================================================ | |||
| class PerformanceDisplay { | |||
| public: | |||
| // 显示ReduceSum性能表头 | |||
| static void printReduceSumHeader() { | |||
| std::cout << "\nReduceSum 性能测试..." << std::endl; | |||
| std::cout << "数据类型: float -> float" << std::endl; | |||
| std::cout << "计算公式:" << std::endl; | |||
| std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; | |||
| std::cout << std::setw(12) << "数据规模" << std::setw(15) << "时间(ms)" | |||
| << std::setw(20) << "吞吐量(G/s)" << std::endl; | |||
| std::cout << std::string(47, '-') << std::endl; | |||
| } | |||
| // 显示SortPair性能表头 | |||
| static void printSortPairHeader() { | |||
| std::cout << "\nSortPair 性能测试..." << std::endl; | |||
| std::cout << "数据类型: <float, uint32_t>" << std::endl; | |||
| std::cout << "计算公式:" << std::endl; | |||
| std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; | |||
| std::cout << std::setw(12) << "数据规模" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)" | |||
| << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl; | |||
| std::cout << std::string(78, '-') << std::endl; | |||
| } | |||
| // 显示TopkPair性能表头 | |||
| static void printTopkPairHeader() { | |||
| std::cout << "\nTopkPair 性能测试..." << std::endl; | |||
| std::cout << "数据类型: <float, uint32_t>" << std::endl; | |||
| std::cout << "计算公式:" << std::endl; | |||
| std::cout << " 吞吐量 = 元素数 / 时间(s) / 1e9 (G/s)" << std::endl; | |||
| } | |||
| static void printTopkPairDataHeader() { | |||
| std::cout << std::setw(8) << "k值" << std::setw(15) << "升序(ms)" << std::setw(15) << "降序(ms)" | |||
| << std::setw(16) << "升序(G/s)" << std::setw(16) << "降序(G/s)" << std::endl; | |||
| std::cout << std::string(74, '-') << std::endl; | |||
| } | |||
| // 显示性能数据行 | |||
| static void printReduceSumData(int size, float time_ms, const PerformanceCalculator::ReduceSumMetrics& metrics) { | |||
| std::cout << std::setw(12) << size << std::setw(15) << std::fixed << std::setprecision(3) | |||
| << time_ms << std::setw(20) << std::setprecision(3) << metrics.throughput_gps << std::endl; | |||
| } | |||
| static void printSortPairData(int size, float asc_time, float desc_time, | |||
| const PerformanceCalculator::SortPairMetrics& asc_metrics, | |||
| const PerformanceCalculator::SortPairMetrics& desc_metrics) { | |||
| std::cout << std::setw(12) << size << std::setw(15) << std::fixed << std::setprecision(3) | |||
| << asc_time << std::setw(15) << desc_time << std::setw(16) << std::setprecision(3) | |||
| << asc_metrics.throughput_gps << std::setw(16) << desc_metrics.throughput_gps << std::endl; | |||
| } | |||
| static void printTopkPairData(int k, float asc_time, float desc_time, | |||
| const PerformanceCalculator::TopkPairMetrics& asc_metrics, | |||
| const PerformanceCalculator::TopkPairMetrics& desc_metrics) { | |||
| std::cout << std::setw(8) << k << std::setw(15) << std::fixed << std::setprecision(3) | |||
| << asc_time << std::setw(15) << desc_time << std::setw(16) << std::setprecision(3) | |||
| << asc_metrics.throughput_gps << std::setw(16) << desc_metrics.throughput_gps << std::endl; | |||
| } | |||
| // 显示性能文件保存消息 | |||
| static void printSavedMessage(const std::string& filename) { | |||
| std::cout << "\n性能结果已保存到: " << filename << std::endl; | |||
| } | |||
| }; | |||
| @@ -0,0 +1,234 @@ | |||
| #pragma once | |||
| #include <vector> | |||
| #include <random> | |||
| #include <algorithm> | |||
| #include <mc_runtime.h> | |||
| #include <maca_fp16.h> | |||
| #include <iostream> | |||
| #include <chrono> | |||
| #include <cmath> | |||
| // 引入模块化头文件 | |||
| #include "yaml_reporter.h" | |||
| #include "performance_utils.h" | |||
| // ============================================================================ | |||
| // 测试配置常量 | |||
| // ============================================================================ | |||
// Problem sizes (element counts) exercised by the tests. Defining
// RUN_FULL_TEST selects the long list; otherwise only the two small sizes run.
#ifdef RUN_FULL_TEST
constexpr int TEST_SIZES[] = {1000000, 134217728, 536870912, 1073741824};  // 1M, 128M, 512M, 1G
#else
constexpr int TEST_SIZES[] = {1000000, 134217728};  // 1M, 128M
#endif
constexpr int NUM_TEST_SIZES = static_cast<int>(sizeof(TEST_SIZES) / sizeof(TEST_SIZES[0]));

// Number of warm-up and benchmark repetitions used by the performance tests.
constexpr int WARMUP_ITERATIONS = 5;
constexpr int BENCHMARK_ITERATIONS = 10;
| // ============================================================================ | |||
| // 错误检查宏 | |||
| // ============================================================================ | |||
// Evaluates `call` exactly once. When the resulting mcError_t is not
// mcSuccess, prints the source location and the mcGetErrorString() text to
// std::cerr and terminates the process with exit code 1.
// The argument is parenthesized and the status lives in a uniquely named
// local, so an expression that itself mentions a variable called `error`
// is not captured by the macro's own variable.
#define MACA_CHECK(call)                                                       \
    do {                                                                       \
        const mcError_t maca_check_status_ = (call);                           \
        if (maca_check_status_ != mcSuccess) {                                 \
            std::cerr << "MACA error at " << __FILE__ << ":" << __LINE__       \
                      << " - " << mcGetErrorString(maca_check_status_)         \
                      << std::endl;                                            \
            exit(1);                                                           \
        }                                                                      \
    } while (0)
| // ============================================================================ | |||
| // 测试数据生成器 | |||
| // ============================================================================ | |||
| class TestDataGenerator { | |||
| private: | |||
| std::mt19937 rng; | |||
| public: | |||
| TestDataGenerator(uint32_t seed = 42) : rng(seed) {} | |||
| // 生成随机float数组 | |||
| std::vector<float> generateRandomFloats(int size, float min_val = -1000.0f, float max_val = 1000.0f) { | |||
| std::vector<float> data(size); | |||
| std::uniform_real_distribution<float> dist(min_val, max_val); | |||
| for (int i = 0; i < size; i++) { | |||
| data[i] = dist(rng); | |||
| } | |||
| return data; | |||
| } | |||
| // 生成随机half数组 | |||
| std::vector<half> generateRandomHalfs(int size, float min_val = -100.0f, float max_val = 100.0f) { | |||
| std::vector<half> data(size); | |||
| std::uniform_real_distribution<float> dist(min_val, max_val); | |||
| for (int i = 0; i < size; i++) { | |||
| data[i] = __float2half(dist(rng)); | |||
| } | |||
| return data; | |||
| } | |||
| // 生成随机uint32_t数组 | |||
| std::vector<uint32_t> generateRandomUint32(int size) { | |||
| std::vector<uint32_t> data(size); | |||
| for (int i = 0; i < size; i++) { | |||
| data[i] = static_cast<uint32_t>(i); // 使用索引作为值,便于验证稳定排序 | |||
| } | |||
| return data; | |||
| } | |||
| // 生成随机int64_t数组 | |||
| std::vector<int64_t> generateRandomInt64(int size) { | |||
| std::vector<int64_t> data(size); | |||
| for (int i = 0; i < size; i++) { | |||
| data[i] = static_cast<int64_t>(i); | |||
| } | |||
| return data; | |||
| } | |||
| // 生成包含NaN和Inf的测试数据 (half版本) | |||
| std::vector<half> generateSpecialHalfs(int size) { | |||
| std::vector<half> data = generateRandomHalfs(size, -10.0f, 10.0f); | |||
| if (size > 100) { | |||
| data[10] = __float2half(NAN); | |||
| data[20] = __float2half(INFINITY); | |||
| data[30] = __float2half(-INFINITY); | |||
| } | |||
| return data; | |||
| } | |||
| // 生成包含NaN和Inf的测试数据 (float版本) | |||
| std::vector<float> generateSpecialFloats(int size) { | |||
| std::vector<float> data = generateRandomFloats(size, -10.0f, 10.0f); | |||
| if (size > 100) { | |||
| data[10] = NAN; | |||
| data[20] = INFINITY; | |||
| data[30] = -INFINITY; | |||
| } | |||
| return data; | |||
| } | |||
| }; | |||
| // ============================================================================ | |||
| // 性能测试工具 | |||
| // ============================================================================ | |||
| class PerformanceMeter { | |||
| private: | |||
| mcEvent_t start, stop; | |||
| public: | |||
| PerformanceMeter() { | |||
| MACA_CHECK(mcEventCreate(&start)); | |||
| MACA_CHECK(mcEventCreate(&stop)); | |||
| } | |||
| ~PerformanceMeter() { | |||
| mcEventDestroy(start); | |||
| mcEventDestroy(stop); | |||
| } | |||
| void startTiming() { | |||
| MACA_CHECK(mcEventRecord(start)); | |||
| } | |||
| float stopTiming() { | |||
| MACA_CHECK(mcEventRecord(stop)); | |||
| MACA_CHECK(mcEventSynchronize(stop)); | |||
| float milliseconds = 0; | |||
| MACA_CHECK(mcEventElapsedTime(&milliseconds, start, stop)); | |||
| return milliseconds; | |||
| } | |||
| }; | |||
| // ============================================================================ | |||
| // 正确性验证工具 | |||
| // ============================================================================ | |||
| template<typename T> | |||
| bool compareArrays(const std::vector<T>& a, const std::vector<T>& b, double tolerance = 1e-6) { | |||
| if (a.size() != b.size()) return false; | |||
| for (size_t i = 0; i < a.size(); i++) { | |||
| if constexpr (std::is_same_v<T, half>) { | |||
| float fa = __half2float(a[i]); | |||
| float fb = __half2float(b[i]); | |||
| if (std::isnan(fa) && std::isnan(fb)) continue; | |||
| if (std::isinf(fa) && std::isinf(fb) && (fa > 0) == (fb > 0)) continue; | |||
| if (std::abs(fa - fb) > tolerance) return false; | |||
| } else if constexpr (std::is_floating_point_v<T>) { | |||
| if (std::isnan(a[i]) && std::isnan(b[i])) continue; | |||
| if (std::isinf(a[i]) && std::isinf(b[i]) && (a[i] > 0) == (b[i] > 0)) continue; | |||
| if (std::abs(a[i] - b[i]) > tolerance) return false; | |||
| } else { | |||
| if (a[i] != b[i]) return false; | |||
| } | |||
| } | |||
| return true; | |||
| } | |||
// CPU reference: stable key/value sort, performed in place.
// Pairs are ordered by key only (ascending, or descending when `descending`
// is true); pairs with equal keys keep their original relative order in both
// directions. `values` is expected to have at least keys.size() elements.
template<typename KeyType, typename ValueType>
void cpuSortPair(std::vector<KeyType>& keys, std::vector<ValueType>& values, bool descending) {
    std::vector<std::pair<KeyType, ValueType>> pairs;
    pairs.reserve(keys.size());
    for (size_t i = 0; i < keys.size(); i++) {
        pairs.emplace_back(keys[i], values[i]);
    }

    // Both comparators look at the key alone. Comparing whole pairs would
    // order ties by value instead of by input position, i.e. not be stable.
    if (descending) {
        std::stable_sort(pairs.begin(), pairs.end(),
                         [](const auto& a, const auto& b) { return a.first > b.first; });
    } else {
        std::stable_sort(pairs.begin(), pairs.end(),
                         [](const auto& a, const auto& b) { return a.first < b.first; });
    }

    for (size_t i = 0; i < pairs.size(); i++) {
        keys[i] = pairs[i].first;
        values[i] = pairs[i].second;
    }
}
// CPU reference: top-k selection over key/value pairs.
// All pairs are stably sorted by key (ascending, or descending when
// `descending` is true) and the first k pairs are copied to the outputs.
// k is clamped to [0, keys_in.size()], so the outputs hold
// min(max(k, 0), keys_in.size()) elements. `values_in` is expected to have at
// least keys_in.size() elements.
template<typename KeyType, typename ValueType>
void cpuTopkPair(const std::vector<KeyType>& keys_in, const std::vector<ValueType>& values_in,
                 std::vector<KeyType>& keys_out, std::vector<ValueType>& values_out,
                 int k, bool descending) {
    std::vector<std::pair<KeyType, ValueType>> pairs;
    pairs.reserve(keys_in.size());
    for (size_t i = 0; i < keys_in.size(); i++) {
        pairs.emplace_back(keys_in[i], values_in[i]);
    }

    // Key-only comparators keep equal keys in input order for both directions.
    if (descending) {
        std::stable_sort(pairs.begin(), pairs.end(),
                         [](const auto& a, const auto& b) { return a.first > b.first; });
    } else {
        std::stable_sort(pairs.begin(), pairs.end(),
                         [](const auto& a, const auto& b) { return a.first < b.first; });
    }

    // Never read past the available pairs, and treat a negative k as zero.
    const size_t count = (k <= 0) ? 0 : std::min(static_cast<size_t>(k), pairs.size());

    keys_out.resize(count);
    values_out.resize(count);
    for (size_t i = 0; i < count; i++) {
        keys_out[i] = pairs[i].first;
        values_out[i] = pairs[i].second;
    }
}
| // CPU参考实现 - ReduceSum (使用double精度) | |||
| template<typename InputT> | |||
| double cpuReduceSum(const std::vector<InputT>& data, double init_value) { | |||
| double sum = init_value; | |||
| for (const auto& val : data) { | |||
| if constexpr (std::is_same_v<InputT, half>) { | |||
| float f_val = __half2float(val); | |||
| if (!std::isnan(f_val)) { | |||
| sum += static_cast<double>(f_val); | |||
| } | |||
| } else { | |||
| if (!std::isnan(val)) { | |||
| sum += static_cast<double>(val); | |||
| } | |||
| } | |||
| } | |||
| return sum; | |||
| } | |||
| @@ -0,0 +1,154 @@ | |||
| #pragma once | |||
| #include <fstream> | |||
| #include <vector> | |||
| #include <map> | |||
| #include <string> | |||
| #include <chrono> | |||
| #include <iomanip> | |||
| #include <sstream> | |||
| // ============================================================================ | |||
| // YAML性能报告生成器 | |||
| // ============================================================================ | |||
// Writes benchmark results as YAML text files.
// Each generate*YAML function truncates/creates `filename`, writes a comment
// header with the generation time, a fixed description of the algorithm and
// one list entry per element of `perf_data`. Entries are string maps; a
// missing required key makes std::map::at throw std::out_of_range.
class YAMLPerformanceReporter {
public:
    struct PerformanceData {
        std::string algorithm;
        std::string input_type;
        std::string output_type;
        std::string key_type;
        std::string value_type;
        std::vector<std::map<std::string, std::string>> metrics;
    };

    // Returns an empty entry for the caller to fill in.
    static std::map<std::string, std::string> createEntry() {
        return {};
    }

    // ReduceSum report. Required keys per entry:
    // data_size, time_ms, throughput_gps, data_type.
    static void generateReduceSumYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
                                      const std::string& filename = "reduce_sum_performance.yaml") {
        std::ofstream yaml_file(filename);

        writeHeader(yaml_file, "ReduceSum算法性能测试结果");

        yaml_file << "algorithm: \"ReduceSum\"\n";
        yaml_file << "data_types:\n";
        yaml_file << " input: \"float\"\n";
        yaml_file << " output: \"float\"\n";
        writeFormulas(yaml_file);

        yaml_file << "performance_data:\n";
        for (const auto& data : perf_data) {
            yaml_file << " - data_size: " << data.at("data_size") << "\n";
            yaml_file << " time_ms: " << formatFloat(data.at("time_ms")) << "\n";
            yaml_file << " throughput_gps: " << formatFloat(data.at("throughput_gps")) << "\n";
            yaml_file << " data_type: " << quoted(data.at("data_type")) << "\n";
        }

        yaml_file.close();
    }

    // SortPair report. Required keys per entry: data_size, asc_time_ms,
    // asc_throughput_gps, desc_time_ms, desc_throughput_gps, key_type, value_type.
    static void generateSortPairYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
                                     const std::string& filename = "sort_pair_performance.yaml") {
        std::ofstream yaml_file(filename);

        writeHeader(yaml_file, "SortPair算法性能测试结果");
        writeKeyValueAlgorithmInfo(yaml_file, "SortPair");

        yaml_file << "performance_data:\n";
        for (const auto& data : perf_data) {
            yaml_file << " - data_size: " << data.at("data_size") << "\n";
            writeAscDescEntry(yaml_file, data);
        }

        yaml_file.close();
    }

    // TopkPair report. Same keys as SortPair plus k_value.
    static void generateTopkPairYAML(const std::vector<std::map<std::string, std::string>>& perf_data,
                                     const std::string& filename = "topk_pair_performance.yaml") {
        std::ofstream yaml_file(filename);

        writeHeader(yaml_file, "TopkPair算法性能测试结果");
        writeKeyValueAlgorithmInfo(yaml_file, "TopkPair");

        yaml_file << "performance_data:\n";
        for (const auto& data : perf_data) {
            yaml_file << " - data_size: " << data.at("data_size") << "\n";
            yaml_file << " k_value: " << data.at("k_value") << "\n";
            writeAscDescEntry(yaml_file, data);
        }

        yaml_file.close();
    }

private:
    // Comment header: title line and generation timestamp (local time).
    static void writeHeader(std::ofstream& file, const std::string& title) {
        file << "# " << title << "\n";
        file << "# 生成时间: ";

        const std::time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
        // std::localtime may return a null pointer; in that case the
        // timestamp is simply left empty instead of dereferencing it.
        if (const std::tm* local_time = std::localtime(&now)) {
            file << std::put_time(local_time, "%Y-%m-%d %H:%M:%S");
        }
        file << "\n\n";
    }

    // The throughput formula section shared by all reports.
    static void writeFormulas(std::ofstream& file) {
        file << "formulas:\n";
        file << " throughput: \"elements / time(s) / 1e9 (G/s)\"\n";
    }

    // Algorithm name, fixed <float, uint32_t> data types and formulas used by
    // the SortPair and TopkPair reports.
    static void writeKeyValueAlgorithmInfo(std::ofstream& file, const std::string& algorithm) {
        file << "algorithm: " << quoted(algorithm) << "\n";
        file << "data_types:\n";
        file << " key_type: \"float\"\n";
        file << " value_type: \"uint32_t\"\n";
        writeFormulas(file);
    }

    // Ascending/descending measurements and the key/value types of one entry.
    static void writeAscDescEntry(std::ofstream& file, const std::map<std::string, std::string>& data) {
        file << " ascending:\n";
        file << " time_ms: " << formatFloat(data.at("asc_time_ms")) << "\n";
        file << " throughput_gps: " << formatFloat(data.at("asc_throughput_gps")) << "\n";
        file << " descending:\n";
        file << " time_ms: " << formatFloat(data.at("desc_time_ms")) << "\n";
        file << " throughput_gps: " << formatFloat(data.at("desc_throughput_gps")) << "\n";
        file << " key_type: " << quoted(data.at("key_type")) << "\n";
        file << " value_type: " << quoted(data.at("value_type")) << "\n";
    }

    // Wraps `text` in double quotes, escaping backslashes and double quotes so
    // the result stays a well-formed double-quoted YAML scalar.
    static std::string quoted(const std::string& text) {
        std::string result = "\"";
        for (const char c : text) {
            if (c == '"' || c == '\\') {
                result += '\\';
            }
            result += c;
        }
        result += '"';
        return result;
    }

    // Re-formats a numeric string with six decimals in fixed notation; a
    // string std::stod cannot parse is returned unchanged.
    static std::string formatFloat(const std::string& value) {
        try {
            const double parsed = std::stod(value);
            std::ostringstream oss;
            oss << std::fixed << std::setprecision(6) << parsed;
            return oss.str();
        } catch (...) {
            return value;
        }
    }
};