| @@ -381,9 +381,6 @@ checkopts "$@" | |||||
| echo "---------------- MindSpore: build start ----------------" | echo "---------------- MindSpore: build start ----------------" | ||||
| mkdir -pv "${BUILD_PATH}/package/mindspore/lib" | mkdir -pv "${BUILD_PATH}/package/mindspore/lib" | ||||
| git submodule update --init graphengine | git submodule update --init graphengine | ||||
| cd "${BASEPATH}/graphengine" | |||||
| git submodule update --init metadef | |||||
| cd "${BASEPATH}" | |||||
| if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then | if [[ "X$ENABLE_AKG" = "Xon" ]] && [[ "X$ENABLE_D" = "Xon" || "X$ENABLE_GPU" = "Xon" ]]; then | ||||
| git submodule update --init --recursive akg | git submodule update --init --recursive akg | ||||
| fi | fi | ||||
| @@ -38,7 +38,6 @@ elseif (DEFINED ENV{D_LINK_PATH}) | |||||
| find_library(slog libslog.so ${GE_LIB_PATH}) | find_library(slog libslog.so ${GE_LIB_PATH}) | ||||
| find_library(mmpa libmmpa.a ${GE_LIB_PATH}) | find_library(mmpa libmmpa.a ${GE_LIB_PATH}) | ||||
| find_library(runtime libruntime.so ${GE_LIB_PATH}) | find_library(runtime libruntime.so ${GE_LIB_PATH}) | ||||
| find_library(msprof libmsprof.so ${GE_LIB_PATH}) | |||||
| find_library(register libregister.so ${GE_LIB_PATH}) | find_library(register libregister.so ${GE_LIB_PATH}) | ||||
| find_library(hccl libhccl.so ${GE_LIB_PATH}) | find_library(hccl libhccl.so ${GE_LIB_PATH}) | ||||
| find_library(cce libcce.so ${GE_LIB_PATH}) | find_library(cce libcce.so ${GE_LIB_PATH}) | ||||
| @@ -59,7 +58,6 @@ else() | |||||
| find_library(cce libcce.so ${ASCEND_RUNTIME_PATH}) | find_library(cce libcce.so ${ASCEND_RUNTIME_PATH}) | ||||
| find_library(hccl libhccl.so ${ASCEND_RUNTIME_PATH}) | find_library(hccl libhccl.so ${ASCEND_RUNTIME_PATH}) | ||||
| find_library(runtime libruntime.so ${ASCEND_RUNTIME_PATH}) | find_library(runtime libruntime.so ${ASCEND_RUNTIME_PATH}) | ||||
| find_library(msprof libmsprof.so ${ASCEND_RUNTIME_PATH}) | |||||
| find_library(register libregister.so ${ASCEND_RUNTIME_PATH}) | find_library(register libregister.so ${ASCEND_RUNTIME_PATH}) | ||||
| find_library(resource libresource.so ${ASCEND_RUNTIME_PATH}) | find_library(resource libresource.so ${ASCEND_RUNTIME_PATH}) | ||||
| find_library(error_manager liberror_manager.so ${ASCEND_RUNTIME_PATH}) | find_library(error_manager liberror_manager.so ${ASCEND_RUNTIME_PATH}) | ||||
| @@ -68,7 +66,6 @@ else() | |||||
| find_library(cce libcce.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | find_library(cce libcce.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | ||||
| find_library(hccl libhccl.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | find_library(hccl libhccl.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | ||||
| find_library(runtime libruntime.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | find_library(runtime libruntime.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | ||||
| find_library(msprof libmsprof.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||||
| find_library(register libregister.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | find_library(register libregister.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | ||||
| find_library(resource libresource.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | find_library(resource libresource.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | ||||
| find_library(error_manager liberror_manager.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | find_library(error_manager liberror_manager.so ${ASCEND_TOOLKIT_RUNTIME_PATH}) | ||||
| @@ -26,7 +26,7 @@ if (ENABLE_D OR ENABLE_ACL OR ENABLE_TESTCASES) | |||||
| # use slog, error manager, mmpa in non ascend mode, e.g. tests | # use slog, error manager, mmpa in non ascend mode, e.g. tests | ||||
| set(GE_PREBUILD_PATH ${GE_SOURCE_DIR}/third_party/prebuild/${CMAKE_HOST_SYSTEM_PROCESSOR}) | set(GE_PREBUILD_PATH ${GE_SOURCE_DIR}/third_party/prebuild/${CMAKE_HOST_SYSTEM_PROCESSOR}) | ||||
| set(ENABLE_MS_TESTCASES TRUE) | set(ENABLE_MS_TESTCASES TRUE) | ||||
| find_submodule_lib(slog libslog.so ${GE_PREBUILD_PATH}) | |||||
| find_submodule_lib(slog libalog.so ${GE_PREBUILD_PATH}) | |||||
| find_submodule_lib(error_manager liberror_manager.so ${GE_PREBUILD_PATH}) | find_submodule_lib(error_manager liberror_manager.so ${GE_PREBUILD_PATH}) | ||||
| find_submodule_lib(static_mmpa libmmpa.a ${GE_PREBUILD_PATH}) | find_submodule_lib(static_mmpa libmmpa.a ${GE_PREBUILD_PATH}) | ||||
| endif() | endif() | ||||
| @@ -1 +1 @@ | |||||
| Subproject commit 20a0326976db65ca01f43ae4ccdd85677faaeb5e | |||||
| Subproject commit 9a7b271674f343157c316b1455aee628c43cffdc | |||||
| @@ -122,7 +122,7 @@ class AscendEnvChecker(EnvChecker): | |||||
| """ascend environment check""" | """ascend environment check""" | ||||
| def __init__(self): | def __init__(self): | ||||
| self.version = ["1.76.T21.0.B210"] | |||||
| self.version = ["1.76.22.0.220"] | |||||
| atlas_nnae_version = "/usr/local/Ascend/nnae/latest/fwkacllib/version.info" | atlas_nnae_version = "/usr/local/Ascend/nnae/latest/fwkacllib/version.info" | ||||
| atlas_toolkit_version = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/version.info" | atlas_toolkit_version = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/version.info" | ||||
| hisi_fwk_version = "/usr/local/Ascend/fwkacllib/version.info" | hisi_fwk_version = "/usr/local/Ascend/fwkacllib/version.info" | ||||
| @@ -248,17 +248,17 @@ if (ENABLE_D) | |||||
| find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | find_library(RUNTIME_LIB runtime ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | ||||
| find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | find_library(TSDCLIENT tsdclient HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | ||||
| find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | find_library(DATATRANSFER datatransfer HINTS ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH} ${ASCEND_DRIVER_BACK_PATH}) | ||||
| find_library(PROFILING msprofiler ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||||
| find_library(PROFILING_SHARED msprof ${ASCEND_DRIVER_PATH}) | |||||
| find_library(PROFILING msprofiler_fwk ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | |||||
| find_library(REGISTER register ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | find_library(REGISTER register ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | ||||
| find_library(OPTILING optiling ${ASCEND_OPP_PATH}) | find_library(OPTILING optiling ${ASCEND_OPP_PATH}) | ||||
| # hccl_adpter | # hccl_adpter | ||||
| find_library(HCCL_ADPTER hcom_graph_adaptor ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | find_library(HCCL_ADPTER hcom_graph_adaptor ${ASCEND_RUNTIME_PATH} ${ASCEND_TOOLKIT_RUNTIME_PATH}) | ||||
| find_library(HCCL_BUILDER hcom_opskernel_builder ${ASCEND_RUNTIME_PATH}/plugin/opskernel ${ASCEND_TOOLKIT_RUNTIME_PATH}/plugin/opskernel) | find_library(HCCL_BUILDER hcom_opskernel_builder ${ASCEND_RUNTIME_PATH}/plugin/opskernel ${ASCEND_TOOLKIT_RUNTIME_PATH}/plugin/opskernel) | ||||
| add_library(ms_profile SHARED ${PROFILING}) | |||||
| add_library(ms_profile SHARED ${CMAKE_CURRENT_SOURCE_DIR}/runtime/device/ascend/profiling/profiling_callback_register.cc) | |||||
| set_target_properties(ms_profile PROPERTIES LINKER_LANGUAGE CXX) | set_target_properties(ms_profile PROPERTIES LINKER_LANGUAGE CXX) | ||||
| target_link_libraries(ms_profile -Wl,--start-group ${PROFILING_SHARED} ${PROFILING} mindspore::protobuf -Wl,--end-group) | |||||
| target_link_options(ms_profile PRIVATE -Wl,-init,common_log_init) | |||||
| target_link_libraries(ms_profile -Wl,--start-group -Wl,--whole-archive ${PROFILING} -Wl,--no-whole-archive mindspore::protobuf -Wl,--end-group) | |||||
| target_link_libraries(mindspore ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} | target_link_libraries(mindspore ge_runtime ${CCE_LIB} ${RUNTIME_LIB} ${TSDCLIENT} ${HCCL} ${DATATRANSFER} | ||||
| ${HCCL_ADPTER} ${REGISTER} -Wl,--no-as-needed ${OPTILING} ${HCCL_BUILDER}) | ${HCCL_ADPTER} ${REGISTER} -Wl,--no-as-needed ${OPTILING} ${HCCL_BUILDER}) | ||||
| target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group) | target_link_libraries(mindspore -Wl,--start-group proto_input mindspore::protobuf -Wl,--end-group) | ||||
| @@ -422,12 +422,7 @@ GraphInfo GetSingleOpGraphInfo(const PrimitivePtr &prim, const std::vector<tenso | |||||
| } | } | ||||
| } // namespace | } // namespace | ||||
| void AscendSession::Init(uint32_t device_id) { | |||||
| InitExecutor(kAscendDevice, device_id); | |||||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id); | |||||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||||
| runtime_instance->CreateContext(); | |||||
| } | |||||
| void AscendSession::Init(uint32_t device_id) { InitExecutor(kAscendDevice, device_id); } | |||||
| void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) { | void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) { | ||||
| auto context_ptr = MsContext::GetInstance(); | auto context_ptr = MsContext::GetInstance(); | ||||
| @@ -1019,7 +1019,6 @@ void InitHccl() { | |||||
| mindspore::parse::python_adapter::set_python_env_flag(true); | mindspore::parse::python_adapter::set_python_env_flag(true); | ||||
| auto ms_context = MsContext::GetInstance(); | auto ms_context = MsContext::GetInstance(); | ||||
| MS_EXCEPTION_IF_NULL(ms_context); | MS_EXCEPTION_IF_NULL(ms_context); | ||||
| (void)context::OpenTsd(ms_context); | |||||
| uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | ||||
| std::string device_name = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET); | std::string device_name = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET); | ||||
| ms_context->set_param<bool>(MS_CTX_ENABLE_HCCL, true); | ms_context->set_param<bool>(MS_CTX_ENABLE_HCCL, true); | ||||
| @@ -1027,10 +1026,14 @@ void InitHccl() { | |||||
| ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) { | ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) { | ||||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_name, device_id); | auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_name, device_id); | ||||
| MS_EXCEPTION_IF_NULL(runtime_instance); | MS_EXCEPTION_IF_NULL(runtime_instance); | ||||
| runtime_instance->PreInit(); | |||||
| (void)context::OpenTsd(ms_context); | |||||
| if (!runtime_instance->Init()) { | if (!runtime_instance->Init()) { | ||||
| MS_LOG(ERROR) << "Kernel runtime init error."; | MS_LOG(ERROR) << "Kernel runtime init error."; | ||||
| return; | return; | ||||
| } | } | ||||
| } else { | |||||
| (void)context::OpenTsd(ms_context); | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -1060,9 +1063,29 @@ void ReleaseGeTsd() { | |||||
| } | } | ||||
| } | } | ||||
| void StartUpProfiling() { | |||||
| auto ms_context = MsContext::GetInstance(); | |||||
| MS_EXCEPTION_IF_NULL(ms_context); | |||||
| if (!ms_context->get_param<bool>(MS_CTX_ENABLE_PROFILING)) { | |||||
| return; | |||||
| } | |||||
| MS_LOG(INFO) << "Startup profiling"; | |||||
| // Start up profiling before OpenTsd | |||||
| uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | |||||
| std::string device_name = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET); | |||||
| if (ms_context->backend_policy() == "ms" && | |||||
| ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) { | |||||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_name, device_id); | |||||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||||
| runtime_instance->PreInit(); | |||||
| } | |||||
| } | |||||
| void InitBackend() { | void InitBackend() { | ||||
| // set python env flag | // set python env flag | ||||
| mindspore::parse::python_adapter::set_python_env_flag(true); | mindspore::parse::python_adapter::set_python_env_flag(true); | ||||
| // Startup profiling before open tsd | |||||
| StartUpProfiling(); | |||||
| // open tsd before ge initialize | // open tsd before ge initialize | ||||
| auto ms_context = MsContext::GetInstance(); | auto ms_context = MsContext::GetInstance(); | ||||
| MS_EXCEPTION_IF_NULL(ms_context); | MS_EXCEPTION_IF_NULL(ms_context); | ||||
| @@ -64,6 +64,7 @@ if (ENABLE_GPU) | |||||
| # add_library(_mindspore_device_cuda_obj OBJECT ${CUDA_SRC_LIST}) | # add_library(_mindspore_device_cuda_obj OBJECT ${CUDA_SRC_LIST}) | ||||
| endif () | endif () | ||||
| list(REMOVE_ITEM D_SRC_LIST "ascend/profiling/profiling_callback_register.cc") | |||||
| set_property(SOURCE ${DEVICE_SRC_LIST} ${D_SRC_LIST} ${CPU_SRC_LIST} | set_property(SOURCE ${DEVICE_SRC_LIST} ${D_SRC_LIST} ${CPU_SRC_LIST} | ||||
| PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) | PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) | ||||
| add_library(_mindspore_runtime_device_obj OBJECT ${DEVICE_SRC_LIST} ${D_SRC_LIST} ${CPU_SRC_LIST}) | add_library(_mindspore_runtime_device_obj OBJECT ${DEVICE_SRC_LIST} ${D_SRC_LIST} ${CPU_SRC_LIST}) | ||||
| @@ -45,6 +45,8 @@ | |||||
| #include "toolchain/adx_datadump_server.h" | #include "toolchain/adx_datadump_server.h" | ||||
| #include "utils/shape_utils.h" | #include "utils/shape_utils.h" | ||||
| #include "utils/trace_base.h" | #include "utils/trace_base.h" | ||||
| #include "graphengine/inc/external/acl/error_codes/rt_error_codes.h" | |||||
| #include "debug/anf_ir_dump.h" | |||||
| #ifdef MEM_REUSE_DEBUG | #ifdef MEM_REUSE_DEBUG | ||||
| #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" | #include "backend/optimizer/mem_reuse/mem_reuse_checker.h" | ||||
| #endif | #endif | ||||
| @@ -54,6 +56,7 @@ | |||||
| #include "utils/config_manager.h" | #include "utils/config_manager.h" | ||||
| #include "runtime/device/ascend/profiling/reporter/op_name_task_stream_reporter.h" | #include "runtime/device/ascend/profiling/reporter/op_name_task_stream_reporter.h" | ||||
| #include "runtime/hccl_adapter/hccl_adapter.h" | #include "runtime/hccl_adapter/hccl_adapter.h" | ||||
| #include "runtime/device/ascend/profiling/profiling_callback_register.h" | |||||
| #include "backend/kernel_compiler/hccl/hccl_context.h" | #include "backend/kernel_compiler/hccl/hccl_context.h" | ||||
| #ifdef ENABLE_TDTQUE | #ifdef ENABLE_TDTQUE | ||||
| #include "tdt/tdt_host_interface.h" | #include "tdt/tdt_host_interface.h" | ||||
| @@ -71,11 +74,9 @@ constexpr uint32_t kTupleTaskId = 0; | |||||
| constexpr uint32_t kTupleStreamId = 1; | constexpr uint32_t kTupleStreamId = 1; | ||||
| constexpr uint32_t kTupleArgs = 2; | constexpr uint32_t kTupleArgs = 2; | ||||
| constexpr uint32_t kProfilingMaxTaskIdInStream = 65531; | constexpr uint32_t kProfilingMaxTaskIdInStream = 65531; | ||||
| constexpr auto kModuleName = "MindSpore"; | |||||
| namespace mindspore { | |||||
| namespace device { | |||||
| namespace ascend { | |||||
| static const size_t PRAMATER_OUTPUT_INDEX = 0; | |||||
| namespace mindspore::device::ascend { | |||||
| static thread_local rtContext_t thread_local_rt_context{nullptr}; | static thread_local rtContext_t thread_local_rt_context{nullptr}; | ||||
| namespace { | namespace { | ||||
| std::string GetRankId() { | std::string GetRankId() { | ||||
| @@ -110,7 +111,9 @@ std::string GetRankId() { | |||||
| } | } | ||||
| } // namespace | } // namespace | ||||
| std::vector<rtExceptionInfo> AscendKernelRuntime::exception_infoes_; | |||||
| std::vector<rtTaskFailInfo> AscendKernelRuntime::task_fail_infoes_ = {}; | |||||
| uint32_t AscendKernelRuntime::current_graph_id_ = 0; | |||||
| std::map<std::string, uint32_t> AscendKernelRuntime::overflow_tasks_; | |||||
| AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } | AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); } | ||||
| void AscendKernelRuntime::SetContext() { | void AscendKernelRuntime::SetContext() { | ||||
| @@ -255,6 +258,11 @@ void AscendKernelRuntime::ReleaseDeviceRes() { | |||||
| mem_manager_->FreeDeviceMemory(); | mem_manager_->FreeDeviceMemory(); | ||||
| } | } | ||||
| auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, nullptr); | |||||
| if (rt_ret != RT_ERROR_NONE) { | |||||
| MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret; | |||||
| } | |||||
| (void)DestroySingleOpHccl(); | (void)DestroySingleOpHccl(); | ||||
| (void)DestroyHccl(); | (void)DestroyHccl(); | ||||
| (void)ResetDevice(device_id); | (void)ResetDevice(device_id); | ||||
| @@ -262,6 +270,13 @@ void AscendKernelRuntime::ReleaseDeviceRes() { | |||||
| MS_LOG(INFO) << "Ascend finalize end"; | MS_LOG(INFO) << "Ascend finalize end"; | ||||
| } | } | ||||
| void AscendKernelRuntime::PreInit() { | |||||
| auto ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); | |||||
| if (!ret) { | |||||
| MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed."; | |||||
| } | |||||
| } | |||||
| bool AscendKernelRuntime::Init() { | bool AscendKernelRuntime::Init() { | ||||
| if (initialized_) { | if (initialized_) { | ||||
| InnerSetContext(); | InnerSetContext(); | ||||
| @@ -269,24 +284,21 @@ bool AscendKernelRuntime::Init() { | |||||
| } | } | ||||
| OpTilingCalculater::GetInstance().Init(); | OpTilingCalculater::GetInstance().Init(); | ||||
| // Start up profiling before rtSetDevice | // Start up profiling before rtSetDevice | ||||
| bool ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); | |||||
| if (!ret) { | |||||
| MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed."; | |||||
| } | |||||
| ret = InitDevice(); | |||||
| bool ret = InitDevice(); | |||||
| if (!ret) { | if (!ret) { | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| SetDebugger(); | SetDebugger(); | ||||
| mem_manager_ = std::make_shared<AscendMemoryManager>(); | mem_manager_ = std::make_shared<AscendMemoryManager>(); | ||||
| MS_EXCEPTION_IF_NULL(mem_manager_); | MS_EXCEPTION_IF_NULL(mem_manager_); | ||||
| mem_manager_->MallocDeviceMemory(); | mem_manager_->MallocDeviceMemory(); | ||||
| // Set callback func when exception error | // Set callback func when exception error | ||||
| auto rt_ret = rtSetTaskFailCallback(ExceptionCallback); | |||||
| auto rt_ret = rtRegTaskFailCallbackByModule(kModuleName, TaskFailCallback); | |||||
| if (rt_ret != RT_ERROR_NONE) { | if (rt_ret != RT_ERROR_NONE) { | ||||
| MS_LOG(EXCEPTION) << "SetTaskFailCallback failed, error: " << rt_ret; | |||||
| MS_LOG(EXCEPTION) << "Reg SetTaskFailCallback failed, error: " << rt_ret; | |||||
| } | } | ||||
| initialized_ = true; | initialized_ = true; | ||||
| @@ -525,42 +537,57 @@ void AscendKernelRuntime::LaunchDataDump(GraphId graph_id) { | |||||
| } | } | ||||
| } | } | ||||
| void AscendKernelRuntime::ExceptionCallback(rtExceptionInfo *exception_info) { | |||||
| void AscendKernelRuntime::TaskFailCallback(rtTaskFailInfo *task_fail_info) { | |||||
| MS_EXCEPTION_IF_NULL(task_fail_info); | |||||
| static std::mutex exception_mutex; | static std::mutex exception_mutex; | ||||
| std::lock_guard<std::mutex> lock(exception_mutex); | std::lock_guard<std::mutex> lock(exception_mutex); | ||||
| exception_infoes_.push_back(*exception_info); | |||||
| if (task_fail_info->retcode == ACL_ERROR_RT_AICORE_OVER_FLOW) { | |||||
| auto key = std::to_string(task_fail_info->streamid) + std::to_string(task_fail_info->taskid); | |||||
| auto find_iter = overflow_tasks_.find(key); | |||||
| if (find_iter == overflow_tasks_.end()) { | |||||
| overflow_tasks_[key] = 1; | |||||
| } else { | |||||
| if (overflow_tasks_[key] == 5) { | |||||
| auto node_name = AscendKernelRuntime::GetErrorNodeName(task_fail_info->streamid, task_fail_info->taskid); | |||||
| MS_LOG(WARNING) << "Node run task overflow, node name: " << node_name; | |||||
| overflow_tasks_.erase(find_iter); | |||||
| } else { | |||||
| overflow_tasks_[key]++; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| MS_LOG(WARNING) << "Task fail infos task_id: " << task_fail_info->taskid | |||||
| << ", stream_id: " << task_fail_info->streamid << ", tid: " << task_fail_info->tid | |||||
| << ", device_id: " << task_fail_info->deviceid << ", retcode: " << task_fail_info->retcode; | |||||
| task_fail_infoes_.push_back(*task_fail_info); | |||||
| } | |||||
| } | } | ||||
| void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) { | |||||
| MS_EXCEPTION_IF_NULL(graph); | |||||
| std::vector<std::string> full_scope_name{}; | |||||
| // Find node name(full scope name) | |||||
| auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(graph->graph_id()); | |||||
| MS_LOG(ERROR) << "Exception_infos_ size: " << exception_infoes_.size() << ". first example: " | |||||
| << ", task_id: " << exception_infoes_.at(0).taskid | |||||
| << ", stream_id: " << exception_infoes_.at(0).streamid << ", tid: " << exception_infoes_.at(0).tid | |||||
| << ", device_id: " << exception_infoes_.at(0).deviceid; | |||||
| for (const auto &exception_info : exception_infoes_) { | |||||
| for (const auto &iter : runtime_info_map) { | |||||
| auto task_id = std::get<kTupleTaskId>(*iter.second); | |||||
| auto stream_id = std::get<kTupleStreamId>(*iter.second); | |||||
| if (task_id == exception_info.taskid && stream_id == exception_info.streamid) { | |||||
| full_scope_name.push_back(iter.first); | |||||
| MS_LOG(ERROR) << "Node: " << iter.first << ", run task error."; | |||||
| } | |||||
| string AscendKernelRuntime::GetErrorNodeName(uint32_t streamid, uint32_t taskid) { | |||||
| auto runtime_info_map = ModelRunner::Instance().GetRuntimeInfoMap(AscendKernelRuntime::current_graph_id_); | |||||
| for (const auto &iter : runtime_info_map) { | |||||
| auto task_id = std::get<kTupleTaskId>(*iter.second); | |||||
| auto stream_id = std::get<kTupleStreamId>(*iter.second); | |||||
| if (task_id == taskid && stream_id == streamid) { | |||||
| MS_LOG(ERROR) << "Node: " << iter.first << ", run task error."; | |||||
| return iter.first; | |||||
| } | } | ||||
| } | } | ||||
| return ""; | |||||
| } | |||||
| void AscendKernelRuntime::DumpTaskExceptionInfo(const session::KernelGraph *graph) { | |||||
| MS_EXCEPTION_IF_NULL(graph); | |||||
| auto full_scope_name = | |||||
| AscendKernelRuntime::GetErrorNodeName(task_fail_infoes_.at(0).streamid, task_fail_infoes_.at(0).taskid); | |||||
| // Dump error data in local path | // Dump error data in local path | ||||
| const std::string local_path = std::string("./task_error_dump/") + std::to_string(exception_infoes_.at(0).deviceid); | |||||
| const std::string local_path = std::string("./task_error_dump/") + std::to_string(task_fail_infoes_.at(0).deviceid); | |||||
| for (const auto &node : graph->execution_order()) { | for (const auto &node : graph->execution_order()) { | ||||
| for (auto &name : full_scope_name) { | |||||
| if (node->fullname_with_scope() == name) { | |||||
| MS_LOG(ERROR) << "Begin to dump node (" << name << ") task error input/output data in local path." | |||||
| << " trace: " << trace::DumpSourceLines(node); | |||||
| E2eDumpUtil::DumpInputImpl(node, false, local_path, &name, nullptr); | |||||
| E2eDumpUtil::DumpOutputImpl(node, false, local_path, &name, nullptr); | |||||
| } | |||||
| if (node->fullname_with_scope() == full_scope_name) { | |||||
| MS_LOG(ERROR) << "Begin to dump node (" << full_scope_name << ") task error input/output data in local path." | |||||
| << " trace: " << trace::DumpSourceLines(node); | |||||
| E2eDumpUtil::DumpInputImpl(node, false, local_path, &full_scope_name, nullptr); | |||||
| E2eDumpUtil::DumpOutputImpl(node, false, local_path, &full_scope_name, nullptr); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -571,7 +598,8 @@ bool AscendKernelRuntime::Run(session::KernelGraph *graph, bool is_task_sink) { | |||||
| #if defined(_WIN32) || defined(_WIN64) | #if defined(_WIN32) || defined(_WIN64) | ||||
| auto start_time = std::chrono::steady_clock::now(); | auto start_time = std::chrono::steady_clock::now(); | ||||
| #else | #else | ||||
| struct timeval start_time, end_time; | |||||
| struct timeval start_time {}; | |||||
| struct timeval end_time {}; | |||||
| (void)gettimeofday(&start_time, nullptr); | (void)gettimeofday(&start_time, nullptr); | ||||
| #endif | #endif | ||||
| if (is_task_sink) { | if (is_task_sink) { | ||||
| @@ -630,6 +658,7 @@ bool AscendKernelRuntime::RunDynamicKernelAsync(const session::KernelGraph *grap | |||||
| } | } | ||||
| bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { | bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { | ||||
| current_graph_id_ = graph->graph_id(); | |||||
| InnerSetContext(); | InnerSetContext(); | ||||
| MS_EXCEPTION_IF_NULL(graph); | MS_EXCEPTION_IF_NULL(graph); | ||||
| if (graph->is_dynamic_shape()) { | if (graph->is_dynamic_shape()) { | ||||
| @@ -656,7 +685,8 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { | |||||
| bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors); | bool status = ModelRunner::Instance().RunModel(graph->graph_id(), input_tensors, output_tensors); | ||||
| if (!status) { | if (!status) { | ||||
| DumpTaskExceptionInfo(graph); | DumpTaskExceptionInfo(graph); | ||||
| std::string file_name = "task_error_debug" + std::to_string(current_graph_id_) + ".ir"; | |||||
| DumpIR(file_name, std::shared_ptr<session::KernelGraph>(const_cast<session::KernelGraph *>(graph))); | |||||
| #ifdef ENABLE_TDTQUE | #ifdef ENABLE_TDTQUE | ||||
| // Run task error, we should call TdtHostDestroy to release tdt to avoid DeviceQueueOp hostPush hung | // Run task error, we should call TdtHostDestroy to release tdt to avoid DeviceQueueOp hostPush hung | ||||
| // case1: cpu usage 100% cause thread/process exit, but some tdt thread remain in backend | // case1: cpu usage 100% cause thread/process exit, but some tdt thread remain in backend | ||||
| @@ -667,10 +697,9 @@ bool AscendKernelRuntime::RunTask(const session::KernelGraph *graph) { | |||||
| MS_LOG(INFO) << "Destroy tsd success."; | MS_LOG(INFO) << "Destroy tsd success."; | ||||
| } | } | ||||
| #endif | #endif | ||||
| return false; | return false; | ||||
| } | } | ||||
| exception_infoes_.clear(); | |||||
| task_fail_infoes_.clear(); | |||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -857,6 +886,4 @@ void AscendKernelRuntime::KernelLaunchProfiling(const std::string &kernel_name) | |||||
| MS_LOG(EXCEPTION) << "Too many profiling data"; | MS_LOG(EXCEPTION) << "Too many profiling data"; | ||||
| } | } | ||||
| } | } | ||||
| } // namespace ascend | |||||
| } // namespace device | |||||
| } // namespace mindspore | |||||
| } // namespace mindspore::device::ascend | |||||
| @@ -32,9 +32,7 @@ | |||||
| using ge::model_runner::TaskInfo; | using ge::model_runner::TaskInfo; | ||||
| using std::unordered_map; | using std::unordered_map; | ||||
| using std::vector; | using std::vector; | ||||
| namespace mindspore { | |||||
| namespace device { | |||||
| namespace ascend { | |||||
| namespace mindspore::device::ascend { | |||||
| class AscendKernelRuntime : public KernelRuntime { | class AscendKernelRuntime : public KernelRuntime { | ||||
| public: | public: | ||||
| AscendKernelRuntime() = default; | AscendKernelRuntime() = default; | ||||
| @@ -56,6 +54,7 @@ class AscendKernelRuntime : public KernelRuntime { | |||||
| void SetContext() override; | void SetContext() override; | ||||
| void CreateContext() override; | void CreateContext() override; | ||||
| void *context() const override { return rt_context_; } | void *context() const override { return rt_context_; } | ||||
| void PreInit() override; | |||||
| protected: | protected: | ||||
| DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | ||||
| @@ -80,8 +79,9 @@ class AscendKernelRuntime : public KernelRuntime { | |||||
| bool CheckGraphIdValid(GraphId graph_id) const; | bool CheckGraphIdValid(GraphId graph_id) const; | ||||
| void DistributeDebugTask(NotNull<const session::KernelGraph *> graph, NotNull<std::function<void *()>> model_handle); | void DistributeDebugTask(NotNull<const session::KernelGraph *> graph, NotNull<std::function<void *()>> model_handle); | ||||
| void LaunchDataDump(GraphId graph_id); | void LaunchDataDump(GraphId graph_id); | ||||
| static string GetErrorNodeName(uint32_t streamid, uint32_t taskid); | |||||
| static void DumpTaskExceptionInfo(const session::KernelGraph *graph); | static void DumpTaskExceptionInfo(const session::KernelGraph *graph); | ||||
| static void ExceptionCallback(rtExceptionInfo *exception_info); | |||||
| static void TaskFailCallback(rtTaskFailInfo *task_fail_info); | |||||
| void ReportProfilingData(); | void ReportProfilingData(); | ||||
| rtContext_t rt_context_{nullptr}; | rtContext_t rt_context_{nullptr}; | ||||
| @@ -90,11 +90,11 @@ class AscendKernelRuntime : public KernelRuntime { | |||||
| unordered_map<GraphId, std::shared_ptr<ge::model_runner::DavinciModel>> graph_model_map_; | unordered_map<GraphId, std::shared_ptr<ge::model_runner::DavinciModel>> graph_model_map_; | ||||
| unordered_map<GraphId, std::shared_ptr<DataDumper>> graph_data_dumper_; | unordered_map<GraphId, std::shared_ptr<DataDumper>> graph_data_dumper_; | ||||
| std::map<std::pair<uint32_t, uint32_t>, std::string> stream_id_task_id_op_name_map_; | std::map<std::pair<uint32_t, uint32_t>, std::string> stream_id_task_id_op_name_map_; | ||||
| static std::vector<rtExceptionInfo> exception_infoes_; | |||||
| static uint32_t current_graph_id_; | |||||
| static std::map<std::string, uint32_t> overflow_tasks_; | |||||
| static std::vector<rtTaskFailInfo> task_fail_infoes_; | |||||
| }; | }; | ||||
| MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime); | MS_REG_KERNEL_RUNTIME(kAscendDevice, AscendKernelRuntime); | ||||
| } // namespace ascend | |||||
| } // namespace device | |||||
| } // namespace mindspore | |||||
| } // namespace mindspore::device::ascend | |||||
| #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_KERNEL_RUNTIME_H_ | #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_ASCEND_KERNEL_RUNTIME_H_ | ||||
| @@ -31,7 +31,7 @@ void AscendMemoryManager::MallocDeviceMemory() { | |||||
| device_mem_size_ = context_mem == 0 ? kAscendDeviceMemSize : context_mem; | device_mem_size_ = context_mem == 0 ? kAscendDeviceMemSize : context_mem; | ||||
| auto ret = rtMalloc(reinterpret_cast<void **>(&device_mem_base_), device_mem_size_, RT_MEMORY_HBM); | auto ret = rtMalloc(reinterpret_cast<void **>(&device_mem_base_), device_mem_size_, RT_MEMORY_HBM); | ||||
| if (ret != ACL_RT_SUCCESS) { | if (ret != ACL_RT_SUCCESS) { | ||||
| if (ret == ACL_ERROR_RT_DRV_INTERNEL_ERROR) { | |||||
| if (ret == ACL_ERROR_RT_DRV_INTERNAL_ERROR) { | |||||
| auto context_ptr = MsContext::GetInstance(); | auto context_ptr = MsContext::GetInstance(); | ||||
| MS_EXCEPTION_IF_NULL(context_ptr); | MS_EXCEPTION_IF_NULL(context_ptr); | ||||
| unsigned int device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID); | unsigned int device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID); | ||||
| @@ -1,42 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "runtime/device/ascend/profiling/plugin_impl.h" | |||||
| #include <string> | |||||
| #include "utils/log_adapter.h" | |||||
| using std::string; | |||||
| namespace mindspore { | |||||
| namespace device { | |||||
| namespace ascend { | |||||
| /* Definition of the static reporter slot declared in PluginImpl; it starts out null. */ Reporter *PluginImpl::reporter_ = nullptr; | |||||
| // Stores a copy of the module name and logs that a plugin object was created. | |||||
| PluginImpl::PluginImpl(const std::string &module) : module_(module) { | |||||
| MS_LOG(INFO) << "Create PluginImpl."; | |||||
| } | |||||
| // Caches the given reporter in the static reporter_ slot so GetPluginReporter() can return it. | |||||
| // A null reporter is rejected by MS_EXCEPTION_IF_NULL; otherwise the result is always 0. | |||||
| int PluginImpl::Init(const Reporter *reporter) { | |||||
| MS_LOG(INFO) << "PluginImpl init"; | |||||
| MS_EXCEPTION_IF_NULL(reporter); | |||||
| // The interface hands over a const pointer while the cached slot is non-const. | |||||
| Reporter *writable_reporter = const_cast<Reporter *>(reporter); | |||||
| reporter_ = writable_reporter; | |||||
| return 0; | |||||
| } | |||||
| // Clears the cached reporter pointer and always returns 0. | |||||
| int PluginImpl::UnInit() { | |||||
| MS_LOG(INFO) << " PluginImpl Uninit "; | |||||
| constexpr int kSuccess = 0; | |||||
| reporter_ = nullptr; | |||||
| return kSuccess; | |||||
| } | |||||
| } // namespace ascend | |||||
| } // namespace device | |||||
| } // namespace mindspore | |||||
| @@ -1,45 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PLUGIN_IMPL_H_ | |||||
| #define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PLUGIN_IMPL_H_ | |||||
| #include <string> | |||||
| #include "./prof_engine.h" | |||||
| using Msprof::Engine::PluginIntf; | |||||
| using Msprof::Engine::Reporter; | |||||
| using std::string; | |||||
| namespace mindspore { | |||||
| namespace device { | |||||
| namespace ascend { | |||||
| /* PluginIntf implementation that keeps the Reporter passed to Init() in a static slot. */ class PluginImpl : public PluginIntf { | |||||
| public: | |||||
| /* module: name copied into module_. */ explicit PluginImpl(const std::string &module); | |||||
| ~PluginImpl() override = default; | |||||
| /* Stores the reporter; defined in plugin_impl.cc. */ int Init(const Reporter *reporter) override; | |||||
| /* Resets the stored reporter to null. */ int UnInit() override; | |||||
| /* Returns the stored reporter, or nullptr when none is set. */ static Reporter *GetPluginReporter() { return reporter_; } | |||||
| private: | |||||
| /* Shared by every PluginImpl instance because it is static. */ static Reporter *reporter_; | |||||
| std::string module_; | |||||
| }; | |||||
| } // namespace ascend | |||||
| } // namespace device | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PLUGIN_IMPL_H_ | |||||
| @@ -0,0 +1,93 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "runtime/device/ascend/profiling/profiling_callback_register.h" | |||||
| #include "runtime/base.h" | |||||
| namespace Analysis { | |||||
| namespace Dvvp { | |||||
| namespace ProfilerCommon { | |||||
| /* Declared here only; the definition is not in this file. Called from VMCallbackRegister::ForceMsprofilerInit(). */ extern int32_t MsprofilerInit(); | |||||
| } // namespace ProfilerCommon | |||||
| } // namespace Dvvp | |||||
| } // namespace Analysis | |||||
| namespace { | |||||
| /* Status returned by the forwarders below when there is nothing to forward to. */ constexpr Status PROF_SUCCESS = 0; | |||||
| /* NOTE(review): not referenced by any function visible in this file - confirm it is needed. */ constexpr Status PROF_FAILED = 0xFFFFFFFF; | |||||
| } // namespace | |||||
| // Forwards the control-callback registration to the handler stored in VMCallbackRegister. | |||||
| // Until a handler set has been registered the call does nothing and reports PROF_SUCCESS. | |||||
| Status RegProfCtrlCallback(MsprofCtrlCallback func) { | |||||
| auto &vm_register = VMCallbackRegister::GetInstance(); | |||||
| if (!vm_register.registed()) { | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| return vm_register.DoRegProfCtrlCallback(func); | |||||
| } | |||||
| // Forwards the set-device callback registration to the handler stored in VMCallbackRegister. | |||||
| // Until a handler set has been registered the call does nothing and reports PROF_SUCCESS. | |||||
| Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { | |||||
| auto &vm_register = VMCallbackRegister::GetInstance(); | |||||
| if (!vm_register.registed()) { | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| return vm_register.DoRegProfSetDeviceCallback(func); | |||||
| } | |||||
| // Forwards the reporter-callback registration to the handler stored in VMCallbackRegister. | |||||
| // Until a handler set has been registered the call does nothing and reports PROF_SUCCESS. | |||||
| Status RegProfReporterCallback(MsprofReporterCallback func) { | |||||
| auto &vm_register = VMCallbackRegister::GetInstance(); | |||||
| if (!vm_register.registed()) { | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| return vm_register.DoRegProfReporterCallback(func); | |||||
| } | |||||
| // Forwards a profiling command (type, payload pointer, payload length) to the handler stored | |||||
| // in VMCallbackRegister. Until a handler set has been registered it reports PROF_SUCCESS. | |||||
| Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) { | |||||
| auto &vm_register = VMCallbackRegister::GetInstance(); | |||||
| if (!vm_register.registed()) { | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| return vm_register.DoProfCommandHandle(type, data, len); | |||||
| } | |||||
| /* Unconditionally reports true; no state is inspected. */ bool IsInitialize() { return true; } | |||||
| /* Returns the single VMCallbackRegister object, held as a function-local static. */ VMCallbackRegister &VMCallbackRegister::GetInstance() { | |||||
| static VMCallbackRegister instance; | |||||
| return instance; | |||||
| } | |||||
| // Stores the four entry points on the first call, marks the register as registered and runs | |||||
| // ForceMsprofilerInit(). Returns true for that first call and false for every later one, | |||||
| // which leaves the stored pointers untouched. | |||||
| bool VMCallbackRegister::Registe(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback), | |||||
| Status (*pRegProfSetDeviceCallback)(MsprofSetDeviceCallback), | |||||
| Status (*pRegProfReporterCallback)(MsprofReporterCallback), | |||||
| Status (*pProfCommandHandle)(ProfCommandHandleType, void *, uint32_t)) { | |||||
| if (registed_) { | |||||
| return false; | |||||
| } | |||||
| pRegProfCtrlCallback_ = pRegProfCtrlCallback; | |||||
| pRegProfSetDeviceCallback_ = pRegProfSetDeviceCallback; | |||||
| pRegProfReporterCallback_ = pRegProfReporterCallback; | |||||
| pProfCommandHandle_ = pProfCommandHandle; | |||||
| registed_ = true; | |||||
| ForceMsprofilerInit(); | |||||
| return true; | |||||
| } | |||||
| // Calls MsprofilerInit() the first time it runs and remembers that it did; later calls are | |||||
| // no-ops. The value returned by MsprofilerInit() is not examined. | |||||
| void VMCallbackRegister::ForceMsprofilerInit() { | |||||
| if (ms_profile_inited_) { | |||||
| return; | |||||
| } | |||||
| (void)Analysis::Dvvp::ProfilerCommon::MsprofilerInit(); | |||||
| ms_profile_inited_ = true; | |||||
| } | |||||
| @@ -0,0 +1,82 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_CALLBACK_REGISTER_H_ | |||||
| #define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_CALLBACK_REGISTER_H_ | |||||
| #include "toolchain/prof_callback.h" | |||||
| #define MAX_DEV_NUM (64) | |||||
| using Status = uint32_t; | |||||
| // Command kinds passed to ProfCommandHandle(); the values are consecutive, starting at 0. | |||||
| enum ProfCommandHandleType { | |||||
| kProfCommandhandleInit = 0, | |||||
| kProfCommandhandleStart = 1, | |||||
| kProfCommandhandleStop = 2, | |||||
| kProfCommandhandleFinalize = 3, | |||||
| kProfCommandhandleModelSubscribe = 4, | |||||
| kProfCommandhandleModelUnsubscribe = 5 | |||||
| }; | |||||
| /* Plain data record; NOTE(review): presumably the payload behind ProfCommandHandle's data pointer - confirm with the caller. */ struct ProfCommandHandleData { | |||||
| uint64_t profSwitch; | |||||
| uint32_t devNums; // length of device id list | |||||
| /* Fixed capacity of MAX_DEV_NUM (64) entries. */ uint32_t devIdList[MAX_DEV_NUM]; | |||||
| uint32_t modelId; | |||||
| }; | |||||
| /* Global-namespace entry points; each one forwards to VMCallbackRegister once handlers are registered. */ Status RegProfCtrlCallback(MsprofCtrlCallback func); | |||||
| Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func); | |||||
| Status RegProfReporterCallback(MsprofReporterCallback func); | |||||
| Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len); | |||||
| /* The definition in profiling_callback_register.cc always returns true. */ bool IsInitialize(); | |||||
| // Singleton that stores four profiling entry points supplied through Registe() and exposes | |||||
| // Do*() wrappers that invoke them. | |||||
| class __attribute__((visibility("default"))) VMCallbackRegister { | |||||
| public: | |||||
| static VMCallbackRegister &GetInstance(); | |||||
| // Copying is disabled and the constructor is private, so instances come from GetInstance(). | |||||
| VMCallbackRegister(const VMCallbackRegister &) = delete; | |||||
| VMCallbackRegister &operator=(const VMCallbackRegister &) = delete; | |||||
| bool Registe(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback), | |||||
| Status (*pRegProfSetDeviceCallback)(MsprofSetDeviceCallback), | |||||
| Status (*pRegProfReporterCallback)(MsprofReporterCallback), | |||||
| Status (*pProfCommandHandle)(ProfCommandHandleType, void *, uint32_t)); | |||||
| void ForceMsprofilerInit(); | |||||
| // True once Registe() has stored a handler set. | |||||
| bool registed() { | |||||
| return registed_; | |||||
| } | |||||
| // The wrappers below call the stored pointer without a null check. | |||||
| Status DoRegProfCtrlCallback(MsprofCtrlCallback func) { | |||||
| return pRegProfCtrlCallback_(func); | |||||
| } | |||||
| Status DoRegProfSetDeviceCallback(MsprofSetDeviceCallback func) { | |||||
| return pRegProfSetDeviceCallback_(func); | |||||
| } | |||||
| Status DoRegProfReporterCallback(MsprofReporterCallback func) { | |||||
| return pRegProfReporterCallback_(func); | |||||
| } | |||||
| Status DoProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) { | |||||
| return pProfCommandHandle_(type, data, len); | |||||
| } | |||||
| private: | |||||
| // All state starts cleared through the default member initializers below. | |||||
| VMCallbackRegister() = default; | |||||
| ~VMCallbackRegister() = default; | |||||
| bool registed_ = false; | |||||
| bool ms_profile_inited_ = false; | |||||
| Status (*pRegProfCtrlCallback_)(MsprofCtrlCallback) = nullptr; | |||||
| Status (*pRegProfSetDeviceCallback_)(MsprofSetDeviceCallback) = nullptr; | |||||
| Status (*pRegProfReporterCallback_)(MsprofReporterCallback) = nullptr; | |||||
| Status (*pProfCommandHandle_)(ProfCommandHandleType, void *, uint32_t) = nullptr; | |||||
| }; | |||||
| #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_CALLBACK_REGISTER_H_ | |||||
| @@ -1,37 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "runtime/device/ascend/profiling/profiling_engine_impl.h" | |||||
| #include "utils/log_adapter.h" | |||||
| #include "runtime/device/ascend/profiling/plugin_impl.h" | |||||
| namespace mindspore { | |||||
| namespace device { | |||||
| namespace ascend { | |||||
| // Allocates a PluginImpl named "Framework". The nothrow form of new is used, so the result | |||||
| // is nullptr when the allocation fails. | |||||
| PluginIntf *ProfilingEngineImpl::CreatePlugin() { | |||||
| MS_LOG(INFO) << "Create Plugin."; | |||||
| PluginIntf *created_plugin = new (std::nothrow) PluginImpl("Framework"); | |||||
| return created_plugin; | |||||
| } | |||||
| // Destroys a plugin previously obtained from CreatePlugin() and always returns 0. | |||||
| // Deleting a null pointer is a no-op, so no explicit null check is needed. The pointer is | |||||
| // passed by value, so resetting it here could never be observed by the caller. | |||||
| int ProfilingEngineImpl::ReleasePlugin(PluginIntf *plugin) { | |||||
| delete plugin; | |||||
| return 0; | |||||
| } | |||||
| } // namespace ascend | |||||
| } // namespace device | |||||
| } // namespace mindspore | |||||
| @@ -1,39 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_ENGINE_IMPL_H_ | |||||
| #define MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_ENGINE_IMPL_H_ | |||||
| #include "./prof_engine.h" | |||||
| using Msprof::Engine::EngineIntf; | |||||
| using Msprof::Engine::PluginIntf; | |||||
| namespace mindspore { | |||||
| namespace device { | |||||
| namespace ascend { | |||||
| /* EngineIntf implementation whose two overrides create and release plugin objects. */ class ProfilingEngineImpl : public EngineIntf { | |||||
| public: | |||||
| ProfilingEngineImpl() = default; | |||||
| ~ProfilingEngineImpl() override = default; | |||||
| /* Returns a new plugin; defined in profiling_engine_impl.cc. */ PluginIntf *CreatePlugin() override; | |||||
| /* Deletes the given plugin and returns 0. */ int ReleasePlugin(PluginIntf *plugin) override; | |||||
| }; | |||||
| } // namespace ascend | |||||
| } // namespace device | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_ENGINE_IMPL_H_ | |||||
| @@ -19,18 +19,20 @@ | |||||
| #include <vector> | #include <vector> | ||||
| #include "securec/include/securec.h" | #include "securec/include/securec.h" | ||||
| #include "./prof_mgr_core.h" | #include "./prof_mgr_core.h" | ||||
| #include "runtime/device/ascend/profiling/plugin_impl.h" | |||||
| #include "runtime/device/ascend/profiling/profiling_engine_impl.h" | |||||
| #include "utils/log_adapter.h" | #include "utils/log_adapter.h" | ||||
| #include "utils/ms_context.h" | #include "utils/ms_context.h" | ||||
| #include "utils/ms_utils.h" | #include "utils/ms_utils.h" | ||||
| #include "utils/convert_utils.h" | #include "utils/convert_utils.h" | ||||
| #include "runtime/base.h" | #include "runtime/base.h" | ||||
| #include "toolchain/prof_acl_api.h" | #include "toolchain/prof_acl_api.h" | ||||
| #include "runtime/device/ascend/profiling/profiling_callback_register.h" | |||||
| namespace { | namespace { | ||||
| constexpr uint32_t kProfilingDeviceNum = 1; | constexpr uint32_t kProfilingDeviceNum = 1; | ||||
| } | |||||
| constexpr auto kRtSetDeviceRegName = "profiling"; | |||||
| constexpr Status PROF_SUCCESS = 0; | |||||
| constexpr Status PROF_FAILED = 0xFFFFFFFF; | |||||
| } // namespace | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| @@ -40,9 +42,7 @@ ProfilingManager &ProfilingManager::GetInstance() { | |||||
| return inst; | return inst; | ||||
| } | } | ||||
| /* NOTE(review): this and the next two rows look like the pre-change constructor from the diff; it nulls prof_handle_ and creates the engine object. */ ProfilingManager::ProfilingManager() : device_id_(0), prof_handle_(nullptr) { | |||||
| engine_0_ = std::make_shared<ProfilingEngineImpl>(); | |||||
| } | |||||
| /* Constructor variant that zero-initializes device_id_ and the prof_cb_ callback table. */ ProfilingManager::ProfilingManager() : device_id_(0), prof_cb_({0}) {} | |||||
| uint64_t ProfilingManager::GetJobId() const { | uint64_t ProfilingManager::GetJobId() const { | ||||
| const char *job_id = std::getenv("JOB_ID"); | const char *job_id = std::getenv("JOB_ID"); | ||||
| @@ -58,14 +58,10 @@ bool ProfilingManager::ReportProfilingData(const map<uint32_t, string> &op_taskI | |||||
| MS_LOG(WARNING) << "op_taskId_map is empty."; | MS_LOG(WARNING) << "op_taskId_map is empty."; | ||||
| return false; | return false; | ||||
| } | } | ||||
| auto reporter = PluginImpl::GetPluginReporter(); | |||||
| if (reporter == nullptr) { | |||||
| MS_LOG(ERROR) << "No profiling data report!"; | |||||
| return false; | |||||
| } | |||||
| MS_LOG(INFO) << "DistributeTask: op tasId map size = " << op_taskId_map.size(); | MS_LOG(INFO) << "DistributeTask: op tasId map size = " << op_taskId_map.size(); | ||||
| Msprof::Engine::ReporterData reporter_data = {}; | |||||
| ReporterData reporter_data = {}; | |||||
| for (const auto &iter : op_taskId_map) { | for (const auto &iter : op_taskId_map) { | ||||
| auto data = iter.second + ' ' + std::to_string(iter.first) + ';'; | auto data = iter.second + ' ' + std::to_string(iter.first) + ';'; | ||||
| reporter_data.deviceId = UintToInt(device_id_); | reporter_data.deviceId = UintToInt(device_id_); | ||||
| @@ -76,41 +72,65 @@ bool ProfilingManager::ReportProfilingData(const map<uint32_t, string> &op_taskI | |||||
| MS_LOG(ERROR) << "memcpy_s error, errorno(" << ret << ")"; | MS_LOG(ERROR) << "memcpy_s error, errorno(" << ret << ")"; | ||||
| return false; | return false; | ||||
| } | } | ||||
| ret = reporter->Report(&reporter_data); | |||||
| if (ret != 0) { | |||||
| MS_LOG(ERROR) << "reporter data fail, errorno(" << ret << ")"; | |||||
| int32_t cb_ret = CallMsprofReport(NOT_NULL(&reporter_data)); | |||||
| if (cb_ret != 0) { | |||||
| MS_LOG(ERROR) << "reporter data fail, errorno(" << cb_ret << ")"; | |||||
| return false; | return false; | ||||
| } | } | ||||
| } | } | ||||
| return true; | return true; | ||||
| } | } | ||||
| static std::vector<std::string> Split(const std::string &str, const char delim) { | |||||
| std::vector<std::string> elems; | |||||
| // Builds the bit mask of profiling modules handed to the runtime profiler start/stop calls | |||||
| // by OR-ing the individual PROF_*_MASK flags together. | |||||
| uint64_t GetProfilingModule() { | |||||
| const uint64_t module_mask = | |||||
| PROF_MODEL_EXECUTE_MASK | PROF_RUNTIME_API_MASK | PROF_RUNTIME_TRACE_MASK | PROF_SCHEDULE_TIMELINE_MASK | | |||||
| PROF_SCHEDULE_TRACE_MASK | PROF_TASK_TIME_MASK | PROF_SUBTASK_TIME_MASK | PROF_AICPU_TRACE_MASK | | |||||
| PROF_AICORE_METRICS_MASK | PROF_AIVECTORCORE_METRICS_MASK | PROF_MODEL_LOAD_MASK; | |||||
| return module_mask; | |||||
| } | |||||
| // Sends MSPROF_REPORTER_INIT for the framework module through the registered reporter | |||||
| // callback and returns its result. Returns PROF_FAILED when no reporter callback is set. | |||||
| Status ProfilingManager::PluginInit() const { | |||||
| const auto reporter_cb = prof_cb_.msprofReporterCallback; | |||||
| if (reporter_cb == nullptr) { | |||||
| MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr."; | |||||
| return PROF_FAILED; | |||||
| } | |||||
| const auto module_id = static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK); | |||||
| const auto request = static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_INIT); | |||||
| return reporter_cb(module_id, request, nullptr, 0); | |||||
| } | |||||
| if (str.empty()) { | |||||
| elems.emplace_back(""); | |||||
| return elems; | |||||
| // Sends MSPROF_REPORTER_UNINIT for the framework module through the registered reporter | |||||
| // callback. A missing callback is logged as an error; a non-zero result is logged as a | |||||
| // warning. Neither case is reported to the caller. | |||||
| void ProfilingManager::PluginUnInit() const { | |||||
| if (prof_cb_.msprofReporterCallback == nullptr) { | |||||
| MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr."; | |||||
| return; | |||||
| } | } | ||||
| int32_t cb_ret = prof_cb_.msprofReporterCallback( | |||||
| static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), | |||||
| static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_UNINIT), nullptr, 0); | |||||
| if (cb_ret != 0) { | |||||
| // The value is streamed, so the message must not carry a printf-style placeholder. | |||||
| MS_LOG(WARNING) << "profiling plugin uninit failed, ret:" << cb_ret; | |||||
| } | |||||
| } | |||||
| std::stringstream ss(str); | |||||
| std::string item; | |||||
| // Fills *prof with the job id (decimal text of GetJobId()) and the profiling options string | |||||
| // read from MsContext. Returns PROF_FAILED when a copy fails or the context is missing, | |||||
| // PROF_SUCCESS otherwise. Neither copy writes a terminating NUL, so *prof is expected to be | |||||
| // zero-initialized by the caller (StartupProfiling does so). | |||||
| // NOTE(review): this span comes from a side-by-side diff dump; rows that use ss, elems, str | |||||
| // or delim, and the second PROF_*_MASK expression, appear to be the other diff side and are | |||||
| // left untouched. | |||||
| Status ProfilingManager::GetProfConf(NotNull<MsprofGeOptions *> prof) { | |||||
| string job_id = std::to_string(GetJobId()); | |||||
| while (getline(ss, item, delim)) { | |||||
| elems.push_back(item); | |||||
| // Copy the string's own length; sizeof(job_id.c_str()) would be the size of a pointer. | |||||
| if (memcpy_s(prof->jobId, sizeof(prof->jobId), job_id.c_str(), job_id.size()) != EOK) { | |||||
| MS_LOG(ERROR) << "Copy job_id failed."; | |||||
| return PROF_FAILED; | |||||
| } | } | ||||
| auto str_size = str.size(); | |||||
| if (str_size > 0 && str[str_size - 1] == delim) { | |||||
| elems.emplace_back(""); | |||||
| auto context = MsContext::GetInstance(); | |||||
| if (context == nullptr) { | |||||
| MS_LOG(ERROR) << "Context is nullptr."; | |||||
| return PROF_FAILED; | |||||
| } | } | ||||
| return elems; | |||||
| } | |||||
| const string prof_options_str = context->get_param<std::string>(MS_CTX_PROFILING_OPTIONS); | |||||
| uint64_t GetProfilingModule() { | |||||
| return PROF_MODEL_EXECUTE_MASK | PROF_RUNTIME_API_MASK | PROF_RUNTIME_TRACE_MASK | PROF_SCHEDULE_TIMELINE_MASK | | |||||
| PROF_SCHEDULE_TRACE_MASK | PROF_TASK_TIME_MASK | PROF_SUBTASK_TIME_MASK | PROF_AICPU_TRACE_MASK | | |||||
| PROF_AICORE_METRICS_MASK | PROF_AIVECTORCORE_METRICS_MASK | PROF_MODEL_LOAD_MASK; | |||||
| if (memcpy_s(prof->options, MSPROF_OPTIONS_DEF_LEN_MAX, prof_options_str.c_str(), prof_options_str.size()) != EOK) { | |||||
| MS_LOG(ERROR) << "Copy profiling_options failed"; | |||||
| return PROF_FAILED; | |||||
| } | |||||
| return PROF_SUCCESS; | |||||
| } | } | ||||
| bool ProfilingManager::StartupProfiling(uint32_t device_id) { | bool ProfilingManager::StartupProfiling(uint32_t device_id) { | ||||
| @@ -120,42 +140,14 @@ bool ProfilingManager::StartupProfiling(uint32_t device_id) { | |||||
| return true; | return true; | ||||
| } | } | ||||
| device_id_ = device_id; | device_id_ = device_id; | ||||
| // register Framework to profiling | |||||
| int result = Msprof::Engine::RegisterEngine("Framework", engine_0_.get()); | |||||
| if (result != 0) { | |||||
| MS_LOG(ERROR) << "Register profiling Engine failed."; | |||||
| struct MsprofGeOptions prof_conf = {0}; | |||||
| if (GetProfConf(NOT_NULL(&prof_conf)) != PROF_SUCCESS) { | |||||
| MS_LOG(ERROR) << "Get prof conf failed."; | |||||
| return false; | return false; | ||||
| } | } | ||||
| auto context = MsContext::GetInstance(); | |||||
| MS_EXCEPTION_IF_NULL(context); | |||||
| const string prof_options_str = context->get_param<std::string>(MS_CTX_PROFILING_OPTIONS); | |||||
| std::vector<string> opts = Split(prof_options_str, ':'); | |||||
| if (opts.empty()) { | |||||
| MS_LOG(WARNING) << "Profiling is enabled, but profiling option is not set!"; | |||||
| return true; | |||||
| } | |||||
| // current one docker only use one device` | |||||
| nlohmann::json p_device; | |||||
| // JOBID | |||||
| auto job_id = GetJobId(); | |||||
| p_device["jobID"] = std::to_string(job_id); | |||||
| // device_id | |||||
| p_device["deviceID"] = std::to_string(device_id); | |||||
| // features:'training_trace', 'task_trace' etc | |||||
| nlohmann::json features; | |||||
| for (std::vector<string>::size_type i = 0; i < opts.size(); i++) { | |||||
| nlohmann::json f; | |||||
| f["name"] = opts[i]; | |||||
| features[i] = f; | |||||
| } | |||||
| p_device["features"] = features; | |||||
| // only one device, but sProfMgrStartUp API require for device list | |||||
| nlohmann::json devices; | |||||
| devices[0] = p_device; | |||||
| nlohmann::json startCfg; | |||||
| startCfg["startCfg"] = devices; | |||||
| if (!ProfStartUp(startCfg)) { | |||||
| if (!ProfStartUp(NOT_NULL(&prof_conf))) { | |||||
| MS_LOG(ERROR) << "ProfMgrStartUp failed."; | MS_LOG(ERROR) << "ProfMgrStartUp failed."; | ||||
| return false; | return false; | ||||
| } | } | ||||
| @@ -168,28 +160,24 @@ uint32_t GetCurrentDeviceId() { | |||||
| return context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | return context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | ||||
| } | } | ||||
| /* NOTE(review): side-by-side diff dump - this JSON-based signature and the rows using cfg, rtProfilerStart and prof_handle_ appear to be the pre-change version interleaved with the new one. */ bool ProfilingManager::ProfStartUp(const nlohmann::json &startCfg) { | |||||
| // convert json to string | |||||
| std::stringstream ss; | |||||
| ss << startCfg; | |||||
| std::string cfg = ss.str(); | |||||
| MS_LOG(INFO) << "profiling config " << cfg; | |||||
| /* Passes *prof_conf to the registered msprofCtrlCallback with MSPROF_CTRL_INIT_GE_OPTIONS; returns false when the callback is missing or its result differs from PROF_SUCCESS. */ bool ProfilingManager::ProfStartUp(NotNull<MsprofGeOptions *> prof_conf) { | |||||
| MS_LOG(INFO) << "Prof start up. "; | |||||
| auto module = GetProfilingModule(); | |||||
| auto device_id = GetCurrentDeviceId(); | |||||
| auto ret = rtProfilerStart(module, kProfilingDeviceNum, &device_id); | |||||
| if (ret != RT_ERROR_NONE) { | |||||
| MS_LOG(INFO) << "Call rtProfilerStart failed, ret:" << ret; | |||||
| if (prof_cb_.msprofCtrlCallback == nullptr) { | |||||
| MS_LOG(ERROR) << "MsprofCtrlCallback callback is nullptr."; | |||||
| return false; | return false; | ||||
| } | } | ||||
| // call profiling startup API | |||||
| ProfMgrCfg prof_cfg = {cfg}; | |||||
| prof_handle_ = ProfMgrStartUp(&prof_cfg); | |||||
| if (prof_handle_ == nullptr) { | |||||
| MS_LOG(ERROR) << "Startup profiling failed."; | |||||
| // call profiling start up api | |||||
| /* The payload length passed below is sizeof(MsprofGeOptions). */ int32_t cb_ret = | |||||
| prof_cb_.msprofCtrlCallback(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), | |||||
| static_cast<void *>(prof_conf.get()), sizeof(MsprofGeOptions)); | |||||
| if (cb_ret != PROF_SUCCESS) { | |||||
| MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret; | |||||
| return false; | return false; | ||||
| } | } | ||||
| MS_LOG(INFO) << "Start up profiling success."; | |||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -199,12 +187,10 @@ bool ProfilingManager::StopProfiling() { | |||||
| MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode."; | MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode."; | ||||
| return true; | return true; | ||||
| } | } | ||||
| Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter(); | |||||
| if (reporter != nullptr) { | |||||
| auto ret = reporter->Flush(); | |||||
| MS_LOG(INFO) << "report data end, ret = " << ret; | |||||
| } | |||||
| // plugin unregister | |||||
| PluginUnInit(); | |||||
| // stop runtime profiler | |||||
| auto module = GetProfilingModule(); | auto module = GetProfilingModule(); | ||||
| uint32_t device_ids[kProfilingDeviceNum] = {GetCurrentDeviceId()}; | uint32_t device_ids[kProfilingDeviceNum] = {GetCurrentDeviceId()}; | ||||
| @@ -214,18 +200,109 @@ bool ProfilingManager::StopProfiling() { | |||||
| return false; | return false; | ||||
| } | } | ||||
| if (prof_handle_ != nullptr) { | |||||
| int result = ProfMgrStop(prof_handle_); | |||||
| if (result != 0) { | |||||
| MS_LOG(ERROR) << "ProfMgr stop return fail:" << result << "."; | |||||
| prof_handle_ = nullptr; | |||||
| return false; | |||||
| } | |||||
| prof_handle_ = nullptr; | |||||
| // stop profiling | |||||
| if (prof_cb_.msprofCtrlCallback == nullptr) { | |||||
| MS_LOG(ERROR) << "MsprofCtrlCallback callback is nullptr."; | |||||
| return false; | |||||
| } | } | ||||
| int32_t cb_ret = | |||||
| prof_cb_.msprofCtrlCallback(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), nullptr, 0); | |||||
| if (cb_ret != 0) { | |||||
| MS_LOG(WARNING) << "Call msprofCtrlCallback failed, ret: " << cb_ret; | |||||
| return false; | |||||
| } | |||||
| return true; | return true; | ||||
| } | } | ||||
| // Hands one ReporterData record to the registered reporter callback as an | |||||
| // MSPROF_REPORTER_REPORT request for the framework module and returns the callback result. | |||||
| // Returns PROF_FAILED when no reporter callback is set. | |||||
| Status ProfilingManager::CallMsprofReport(NotNull<ReporterData *> reporter_data) const { | |||||
| const auto reporter_cb = prof_cb_.msprofReporterCallback; | |||||
| if (reporter_cb == nullptr) { | |||||
| MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr."; | |||||
| return PROF_FAILED; | |||||
| } | |||||
| const auto module_id = static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK); | |||||
| const auto request = static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_REPORT); | |||||
| void *payload = static_cast<void *>(reporter_data.get()); | |||||
| return reporter_cb(module_id, request, payload, sizeof(ReporterData)); | |||||
| } | |||||
| // Stores the control callback in ProfilingManager unless one is already stored; an existing | |||||
| // callback is kept and only a warning is logged. A null func yields PROF_FAILED, every other | |||||
| // path yields PROF_SUCCESS. | |||||
| Status RegProfCtrlCallback(MsprofCtrlCallback func) { | |||||
| if (func == nullptr) { | |||||
| MS_LOG(ERROR) << "Msprof ctrl callback is nullptr."; | |||||
| return PROF_FAILED; | |||||
| } | |||||
| const bool already_set = ProfilingManager::GetInstance().GetMsprofCallback().msprofCtrlCallback != nullptr; | |||||
| if (already_set) { | |||||
| MS_LOG(WARNING) << "Msprof ctrl callback is exist, just ignore it."; | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| MS_LOG(INFO) << "GE register Msprof ctrl callback."; | |||||
| ProfilingManager::GetInstance().SetMsprofCtrlCallback(func); | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| // Stores the set-device callback in ProfilingManager and registers it with the runtime under | |||||
| // the name kRtSetDeviceRegName. A null func yields PROF_FAILED; a failing runtime | |||||
| // registration returns the runtime's status unchanged. | |||||
| Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { | |||||
| if (func == nullptr) { | |||||
| MS_LOG(ERROR) << "MsprofSetDeviceCallback callback is nullptr."; | |||||
| return PROF_FAILED; | |||||
| } | |||||
| ProfilingManager::GetInstance().SetMsprofSetDeviceCallback(func); | |||||
| // Hand the same callback to the runtime. | |||||
| MS_LOG(INFO) << "GE pass setdevice callback to runtime."; | |||||
| const Status rt_ret = rtRegDeviceStateCallback(kRtSetDeviceRegName, static_cast<rtDeviceStateCallback>(func)); | |||||
| if (rt_ret == PROF_SUCCESS) { | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| MS_LOG(ERROR) << "Pass MsprofSetDeviceCallback to runtime failed!"; | |||||
| return rt_ret; | |||||
| } | |||||
| // Stores the reporter callback in ProfilingManager and passes it on to the runtime, unless a | |||||
| // reporter callback is already stored, in which case only a warning is logged. A null func | |||||
| // yields PROF_FAILED; a failing runtime registration returns the runtime's status. | |||||
| Status RegProfReporterCallback(MsprofReporterCallback func) { | |||||
| if (func == nullptr) { | |||||
| MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr."; | |||||
| return PROF_FAILED; | |||||
| } | |||||
| const bool already_set = ProfilingManager::GetInstance().GetMsprofCallback().msprofReporterCallback != nullptr; | |||||
| if (already_set) { | |||||
| MS_LOG(WARNING) << "Msprof reporter callback is exist, just ignore it."; | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| MS_LOG(INFO) << "GE register Msprof reporter callback."; | |||||
| ProfilingManager::GetInstance().SetMsprofReporterCallback(func); | |||||
| // Hand the same callback to the runtime. | |||||
| const Status rt_ret = rtSetMsprofReporterCallback(func); | |||||
| if (rt_ret != PROF_SUCCESS) { | |||||
| MS_LOG(ERROR) << "Pass MsprofReporterCallback to runtime failed, ret: " << rt_ret; | |||||
| return rt_ret; | |||||
| } | |||||
| // NOTE(review): a "Pass MsprofReporterCallback to hccl" marker stood here with no code after it. | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| // Handles a profiling command. Only kProfCommandhandleInit does any work: it initializes | |||||
| // the reporter plugin and then starts the runtime profiler for the current device. Every | |||||
| // other command type is logged and accepted. data and len are not used by this function. | |||||
| // Returns PROF_FAILED when plugin init or rtProfilerStart fails, PROF_SUCCESS otherwise. | |||||
| Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) { | |||||
| MS_LOG(INFO) << "ProfCommandHandle start, type:" << type; | |||||
| if (type == kProfCommandhandleInit) { | |||||
| auto cb_ret = ProfilingManager::GetInstance().PluginInit(); | |||||
| if (cb_ret != PROF_SUCCESS) { | |||||
| MS_LOG(ERROR) << "Profiling plugin init failed."; | |||||
| return PROF_FAILED; | |||||
| } | |||||
| // call runtime profiler API | |||||
| auto module = GetProfilingModule(); | |||||
| auto device_id = GetCurrentDeviceId(); | |||||
| auto ret = rtProfilerStart(module, kProfilingDeviceNum, &device_id); | |||||
| if (ret != RT_ERROR_NONE) { | |||||
| MS_LOG(ERROR) << "Call rtProfilerStart failed, ret:" << ret; | |||||
| return PROF_FAILED; | |||||
| } | |||||
| } | |||||
| return PROF_SUCCESS; | |||||
| } | |||||
| // Registers this file's four profiling entry points with VMCallbackRegister and returns | |||||
| // what Registe() reports. | |||||
| bool DoRegiste() { | |||||
| MS_LOG(INFO) << "VM profiling register start"; | |||||
| auto &vm_register = VMCallbackRegister::GetInstance(); | |||||
| return vm_register.Registe(RegProfCtrlCallback, RegProfSetDeviceCallback, RegProfReporterCallback, | |||||
| ProfCommandHandle); | |||||
| } | |||||
| /* Namespace-scope static whose initializer calls DoRegiste(), so registration runs when this variable is initialized. */ static bool doRegiste = DoRegiste(); | |||||
| } // namespace ascend | } // namespace ascend | ||||
| } // namespace device | } // namespace device | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -23,13 +23,21 @@ | |||||
| #include <nlohmann/json.hpp> | #include <nlohmann/json.hpp> | ||||
| #include "utils/contract.h" | #include "utils/contract.h" | ||||
| #include "utils/ms_context.h" | #include "utils/ms_context.h" | ||||
| #include "toolchain/prof_callback.h" | |||||
| #include "runtime/device/ascend/profiling/profiling_callback_register.h" | |||||
| using std::map; | using std::map; | ||||
| using std::string; | using std::string; | ||||
| using Status = uint32_t; | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| namespace ascend { | namespace ascend { | ||||
| class ProfilingEngineImpl; | |||||
| /* Table of the three msprof callbacks kept by ProfilingManager; a null entry means no callback is stored. */ struct MsprofCallback { | |||||
| /* Invoked by ProfStartUp and StopProfiling. */ MsprofCtrlCallback msprofCtrlCallback; | |||||
| /* Stored by RegProfSetDeviceCallback, which also passes it to the runtime. */ MsprofSetDeviceCallback msprofSetDeviceCallback; | |||||
| /* Invoked by PluginInit, PluginUnInit and CallMsprofReport. */ MsprofReporterCallback msprofReporterCallback; | |||||
| }; | |||||
| class ProfilingManager { | class ProfilingManager { | ||||
| public: | public: | ||||
| static ProfilingManager &GetInstance(); | static ProfilingManager &GetInstance(); | ||||
| @@ -43,17 +51,31 @@ class ProfilingManager { | |||||
| MS_EXCEPTION_IF_NULL(context); | MS_EXCEPTION_IF_NULL(context); | ||||
| return context->get_param<bool>(MS_CTX_ENABLE_PROFILING); | return context->get_param<bool>(MS_CTX_ENABLE_PROFILING); | ||||
| } | } | ||||
| Status PluginInit() const; | |||||
| void PluginUnInit() const; | |||||
| Status CallMsprofReport(NotNull<ReporterData *> reporter_data) const; | |||||
| struct MsprofCallback &GetMsprofCallback() { | |||||
| return prof_cb_; | |||||
| } | |||||
| void SetMsprofCtrlCallback(MsprofCtrlCallback func) { prof_cb_.msprofCtrlCallback = func; } | |||||
| void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; } | |||||
| void SetMsprofSetDeviceCallback(MsprofSetDeviceCallback func) { prof_cb_.msprofSetDeviceCallback = func; } | |||||
| Status GetProfConf(NotNull<MsprofGeOptions *> prof); | |||||
| protected: | protected: | ||||
| ProfilingManager(); | ProfilingManager(); | ||||
| ~ProfilingManager() { prof_handle_ = nullptr; } | |||||
| ~ProfilingManager() {} | |||||
| private: | private: | ||||
| bool ProfStartUp(const nlohmann::json &json); | |||||
| std::shared_ptr<ProfilingEngineImpl> engine_0_; | |||||
| bool ProfStartUp(NotNull<MsprofGeOptions *> prof_conf); | |||||
| uint32_t device_id_; | uint32_t device_id_; | ||||
| void *prof_handle_; | |||||
| MsprofCallback prof_cb_; | |||||
| }; | }; | ||||
| Status RegProfCtrlCallback(MsprofCtrlCallback func); | |||||
| Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func); | |||||
| Status RegProfReporterCallback(MsprofReporterCallback func); | |||||
| Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len); | |||||
| } // namespace ascend | } // namespace ascend | ||||
| } // namespace device | } // namespace device | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -24,14 +24,15 @@ | |||||
| #include "runtime/device/ascend/profiling/reporter/task_desc_reporter.h" | #include "runtime/device/ascend/profiling/reporter/task_desc_reporter.h" | ||||
| #include "utils/ms_context.h" | #include "utils/ms_context.h" | ||||
| #include "runtime/device/ascend/profiling/reporter/point_reporter.h" | #include "runtime/device/ascend/profiling/reporter/point_reporter.h" | ||||
| #include "nlohmann/json.hpp" | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace device { | namespace device { | ||||
| namespace ascend { | namespace ascend { | ||||
| constexpr uint32_t kMaxProfilingNodeNum = 100; | constexpr uint32_t kMaxProfilingNodeNum = 100; | ||||
| constexpr char kCustomNode[] = "PROFILING_CUSTOM_"; | constexpr char kCustomNode[] = "PROFILING_CUSTOM_"; | ||||
| constexpr char kFpStartNode[] = "PROFILING_FP_START"; | |||||
| constexpr char kBpEndNode[] = "PROFILING_BP_END"; | |||||
| constexpr char kFpStartNode[] = "fp_point"; | |||||
| constexpr char kBpEndNode[] = "bp_point"; | |||||
| constexpr char kIterEndNode[] = "PROFILING_ITER_END"; | constexpr char kIterEndNode[] = "PROFILING_ITER_END"; | ||||
| // PROFILING_CUSTOM_LOGID_START 3 | // PROFILING_CUSTOM_LOGID_START 3 | ||||
| constexpr uint64_t kProfilingFpStartLogId = 1; | constexpr uint64_t kProfilingFpStartLogId = 1; | ||||
| @@ -42,14 +43,29 @@ std::map<uint32_t, std::vector<std::string>> ProfilingUtils::graph_kernel_name_; | |||||
| std::map<uint32_t, std::vector<std::shared_ptr<ProfDesc>>> ProfilingUtils::graph_point_; | std::map<uint32_t, std::vector<std::shared_ptr<ProfDesc>>> ProfilingUtils::graph_point_; | ||||
| uint32_t ProfilingUtils::custom_node_index_ = 1; | uint32_t ProfilingUtils::custom_node_index_ = 1; | ||||
| nlohmann::json GetContextProfilingOption() { | |||||
| auto context = MsContext::GetInstance(); | |||||
| MS_EXCEPTION_IF_NULL(context); | |||||
| const string prof_options_str = context->get_param<std::string>(MS_CTX_PROFILING_OPTIONS); | |||||
| nlohmann::json j; | |||||
| try { | |||||
| j = nlohmann::json::parse(prof_options_str); | |||||
| } catch (nlohmann::json::parse_error &e) { | |||||
| MS_LOG(EXCEPTION) << "Parse profiling option json failed, error:" << e.what(); | |||||
| } | |||||
| return j; | |||||
| } | |||||
| ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<const session::KernelGraph *> graph_ptr) { | ProfilingTraceInfo ProfilingUtils::GetProfilingTraceFromEnv(NotNull<const session::KernelGraph *> graph_ptr) { | ||||
| MS_LOG(INFO) << "get env start"; | MS_LOG(INFO) << "get env start"; | ||||
| custom_node_index_ = 1; | custom_node_index_ = 1; | ||||
| auto &cnode_exec_order = graph_ptr->execution_order(); | auto &cnode_exec_order = graph_ptr->execution_order(); | ||||
| auto profiling_option = GetContextProfilingOption(); | |||||
| ProfilingTraceInfo profiling_trace; | ProfilingTraceInfo profiling_trace; | ||||
| profiling_trace.trace_begin = GetTraceBegin(cnode_exec_order); | |||||
| profiling_trace.trace_bp_end = GetTraceBpEnd(cnode_exec_order); | |||||
| profiling_trace.trace_netoutput = GetTraceNetoutput(cnode_exec_order); | |||||
| profiling_trace.trace_begin = GetTraceBegin(cnode_exec_order, profiling_option); | |||||
| profiling_trace.trace_bp_end = GetTraceBpEnd(cnode_exec_order, profiling_option); | |||||
| profiling_trace.trace_netoutput = GetTraceNetoutput(cnode_exec_order, profiling_option); | |||||
| for (uint32_t i = 1; i <= kMaxProfilingNodeNum; ++i) { | for (uint32_t i = 1; i <= kMaxProfilingNodeNum; ++i) { | ||||
| std::string env_str = std::string(kCustomNode) + std::to_string(i); | std::string env_str = std::string(kCustomNode) + std::to_string(i); | ||||
| @@ -80,10 +96,14 @@ void ProfilingUtils::GetTraceHccl(const std::vector<CNodePtr> &cnode_exec_order, | |||||
| } | } | ||||
| } | } | ||||
| std::string ProfilingUtils::GetTraceBegin(const std::vector<CNodePtr> &cnode_exec_order) { | |||||
| const char *trace_begin = std::getenv(kFpStartNode); | |||||
| if (trace_begin != nullptr) { | |||||
| return std::string(trace_begin); | |||||
| std::string ProfilingUtils::GetTraceBegin(const std::vector<CNodePtr> &cnode_exec_order, const nlohmann::json &option) { | |||||
| auto iter = option.find(kFpStartNode); | |||||
| if (iter != option.end() && iter->is_string()) { | |||||
| std::string trace_begin_str = *iter; | |||||
| if (!trace_begin_str.empty()) { | |||||
| MS_LOG(INFO) << "Get fp_point from profiling_option:" << trace_begin_str; | |||||
| return trace_begin_str; | |||||
| } | |||||
| } | } | ||||
| std::string fp_start_str; | std::string fp_start_str; | ||||
| @@ -124,12 +144,16 @@ void ProfilingUtils::GetCNodeOutputRealNode(const std::string &node_name, const | |||||
| } | } | ||||
| } | } | ||||
| std::string ProfilingUtils::GetTraceBpEnd(const std::vector<CNodePtr> &cnode_exec_order) { | |||||
| const char *trace_bp_end = std::getenv(kBpEndNode); | |||||
| if (trace_bp_end != nullptr) { | |||||
| return std::string(trace_bp_end); | |||||
| std::string ProfilingUtils::GetTraceBpEnd(const std::vector<CNodePtr> &cnode_exec_order, const nlohmann::json &option) { | |||||
| auto bp_point = option.find(kBpEndNode); | |||||
| if (bp_point != option.end() && bp_point->is_string()) { | |||||
| std::string bp_point_str = *bp_point; | |||||
| if (!bp_point_str.empty()) { | |||||
| MS_LOG(INFO) << "Get bp_point from profiling_option:" << bp_point_str; | |||||
| return bp_point_str; | |||||
| } | |||||
| } | } | ||||
| std::string bp_end_str; | std::string bp_end_str; | ||||
| // Contain hccl kernel | // Contain hccl kernel | ||||
| auto iter = cnode_exec_order.rbegin(); | auto iter = cnode_exec_order.rbegin(); | ||||
| @@ -179,9 +203,17 @@ std::string ProfilingUtils::GetGraphLastTbeKernelName(const std::vector<CNodePtr | |||||
| return last_tbe_kernel_name; | return last_tbe_kernel_name; | ||||
| } | } | ||||
| std::string ProfilingUtils::GetTraceNetoutput(const std::vector<CNodePtr> &cnode_exec_order) { | |||||
| const char *trace_netoutput = std::getenv(kIterEndNode); | |||||
| return trace_netoutput == nullptr ? GetGraphLastTbeKernelName(cnode_exec_order) : std::string(trace_netoutput); | |||||
| std::string ProfilingUtils::GetTraceNetoutput(const std::vector<CNodePtr> &cnode_exec_order, | |||||
| const nlohmann::json &option) { | |||||
| auto iter_end = option.find(kIterEndNode); | |||||
| if (iter_end != option.end() && iter_end->is_string()) { | |||||
| std::string iter_end_str = *iter_end; | |||||
| if (!iter_end_str.empty()) { | |||||
| MS_LOG(INFO) << "Get iter_end from profiling_option:" << iter_end_str; | |||||
| return iter_end_str; | |||||
| } | |||||
| } | |||||
| return GetGraphLastTbeKernelName(cnode_exec_order); | |||||
| } | } | ||||
| NotNull<CNodePtr> ProfilingUtils::CreateProfilingCNode(const ProfilingContent &profiling_content, | NotNull<CNodePtr> ProfilingUtils::CreateProfilingCNode(const ProfilingContent &profiling_content, | ||||
| @@ -118,9 +118,9 @@ class ProfilingUtils { | |||||
| NotNull<session::KernelGraph *> graph_ptr); | NotNull<session::KernelGraph *> graph_ptr); | ||||
| static CNodePtr CreateProfilingCNodeWithStream(const AnfNodePtr &anf_node, const ProfilingContent &profiling_content, | static CNodePtr CreateProfilingCNodeWithStream(const AnfNodePtr &anf_node, const ProfilingContent &profiling_content, | ||||
| NotNull<session::KernelGraph *> graph_ptr); | NotNull<session::KernelGraph *> graph_ptr); | ||||
| static std::string GetTraceBegin(const std::vector<CNodePtr> &cnode_exec_order); | |||||
| static std::string GetTraceBpEnd(const std::vector<CNodePtr> &cnode_exec_order); | |||||
| static std::string GetTraceNetoutput(const std::vector<CNodePtr> &cnode_exec_order); | |||||
| static std::string GetTraceBegin(const std::vector<CNodePtr> &cnode_exec_order, const nlohmann::json &option); | |||||
| static std::string GetTraceBpEnd(const std::vector<CNodePtr> &cnode_exec_order, const nlohmann::json &option); | |||||
| static std::string GetTraceNetoutput(const std::vector<CNodePtr> &cnode_exec_order, const nlohmann::json &option); | |||||
| static std::string GetGraphLastTbeKernelName(const std::vector<CNodePtr> &cnode_exec_order); | static std::string GetGraphLastTbeKernelName(const std::vector<CNodePtr> &cnode_exec_order); | ||||
| static void GetTraceHccl(const std::vector<CNodePtr> &cnode_exec_order, | static void GetTraceHccl(const std::vector<CNodePtr> &cnode_exec_order, | ||||
| NotNull<ProfilingTraceInfo *> profiling_trace); | NotNull<ProfilingTraceInfo *> profiling_trace); | ||||
| @@ -16,7 +16,7 @@ | |||||
| #include <algorithm> | #include <algorithm> | ||||
| #include "runtime/device/ascend/profiling/reporter/desc_reporter.h" | #include "runtime/device/ascend/profiling/reporter/desc_reporter.h" | ||||
| #include "runtime/device/ascend/profiling/plugin_impl.h" | |||||
| #include "runtime/device/ascend/profiling/profiling_manager.h" | |||||
| #include "utils/log_adapter.h" | #include "utils/log_adapter.h" | ||||
| constexpr size_t kReportMaxLen = 2048; | constexpr size_t kReportMaxLen = 2048; | ||||
| @@ -27,16 +27,13 @@ namespace ascend { | |||||
| DescReporter::~DescReporter() = default; | DescReporter::~DescReporter() = default; | ||||
| void DescReporter::ReportByLine(const std::string &data, const std::string &file_name) const { | void DescReporter::ReportByLine(const std::string &data, const std::string &file_name) const { | ||||
| auto reporter = PluginImpl::GetPluginReporter(); | |||||
| MS_EXCEPTION_IF_NULL(reporter); | |||||
| auto tot_size = data.size(); | auto tot_size = data.size(); | ||||
| size_t cur_size = 0; | size_t cur_size = 0; | ||||
| while (cur_size < tot_size) { | while (cur_size < tot_size) { | ||||
| size_t remain_size = tot_size - cur_size; | size_t remain_size = tot_size - cur_size; | ||||
| size_t report_size = std::min(remain_size, kReportMaxLen); | size_t report_size = std::min(remain_size, kReportMaxLen); | ||||
| Msprof::Engine::ReporterData report_data{}; | |||||
| ReporterData report_data{}; | |||||
| report_data.deviceId = device_id_; | report_data.deviceId = device_id_; | ||||
| report_data.dataLen = report_size; | report_data.dataLen = report_size; | ||||
| report_data.data = (unsigned char *)data.c_str() + cur_size; | report_data.data = (unsigned char *)data.c_str() + cur_size; | ||||
| @@ -44,7 +41,7 @@ void DescReporter::ReportByLine(const std::string &data, const std::string &file | |||||
| if (ret != 0) { | if (ret != 0) { | ||||
| MS_LOG(EXCEPTION) << "Memcpy_s report data tag failed"; | MS_LOG(EXCEPTION) << "Memcpy_s report data tag failed"; | ||||
| } | } | ||||
| auto report_ret = reporter->Report(&report_data); | |||||
| auto report_ret = ProfilingManager::GetInstance().CallMsprofReport(NOT_NULL(&report_data)); | |||||
| if (report_ret != 0) { | if (report_ret != 0) { | ||||
| MS_LOG(EXCEPTION) << "Report data failed"; | MS_LOG(EXCEPTION) << "Report data failed"; | ||||
| } | } | ||||
| @@ -515,6 +515,10 @@ CNodePtr KernelAdjust::CreateStreamAssignAddnOP(const std::shared_ptr<session::K | |||||
| selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL); | selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL); | ||||
| MS_EXCEPTION_IF_NULL(switch_loop_input.at(kCurLoopCountParamName)); | MS_EXCEPTION_IF_NULL(switch_loop_input.at(kCurLoopCountParamName)); | ||||
| assign_add_one->set_abstract(switch_loop_input.at(kCurLoopCountParamName)->abstract()); | assign_add_one->set_abstract(switch_loop_input.at(kCurLoopCountParamName)->abstract()); | ||||
| // add AssignAdd op to kernel ref node map | |||||
| session::AnfWithOutIndex final_pair = std::make_pair(assign_add_one, 0); | |||||
| session::KernelWithIndex kernel_with_index = AnfAlgo::VisitKernel(AnfAlgo::GetInputNode(assign_add_one, 0), 0); | |||||
| kernel_graph_ptr->AddRefCorrespondPairs(final_pair, kernel_with_index); | |||||
| return assign_add_one; | return assign_add_one; | ||||
| } | } | ||||
| @@ -99,6 +99,8 @@ class KernelRuntime { | |||||
| #endif | #endif | ||||
| } | } | ||||
| virtual void PreInit() {} | |||||
| protected: | protected: | ||||
| virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | ||||
| TypeId type_id) = 0; | TypeId type_id) = 0; | ||||
| @@ -67,26 +67,26 @@ bool CommManager::CreateGroupSync(const string &group, const vector<unsigned int | |||||
| HCCL_GROUP_CHECK_EMPTY(group); | HCCL_GROUP_CHECK_EMPTY(group); | ||||
| HCCL_GROUP_CHECK_IS_WORLD(group); | HCCL_GROUP_CHECK_IS_WORLD(group); | ||||
| HCCL_RUN_CHECK(string("create communicate group"), group, | HCCL_RUN_CHECK(string("create communicate group"), group, | ||||
| hcom_create_group(group.c_str(), UlongToUint(rank_size), vector<unsigned int>(rank_id_list).data())); | |||||
| HcomCreateGroup(group.c_str(), UlongToUint(rank_size), vector<unsigned int>(rank_id_list).data())); | |||||
| return true; | return true; | ||||
| } | } | ||||
| bool CommManager::GetRankID(const string &group, unsigned int *rank_id) const { | bool CommManager::GetRankID(const string &group, unsigned int *rank_id) const { | ||||
| HCCL_GROUP_CHECK_EMPTY(group); | HCCL_GROUP_CHECK_EMPTY(group); | ||||
| HCCL_RUN_CHECK(string("get rank_id"), group, hcom_get_rank_id(group.c_str(), rank_id)); | |||||
| HCCL_RUN_CHECK(string("get rank_id"), group, HcomGetRankId(group.c_str(), rank_id)); | |||||
| return true; | return true; | ||||
| } | } | ||||
| bool CommManager::GetRankSize(const string &group, unsigned int *rank_size) const { | bool CommManager::GetRankSize(const string &group, unsigned int *rank_size) const { | ||||
| HCCL_GROUP_CHECK_EMPTY(group); | HCCL_GROUP_CHECK_EMPTY(group); | ||||
| HCCL_RUN_CHECK(string("get rank size"), group, hcom_get_rank_size(group.c_str(), rank_size)); | |||||
| HCCL_RUN_CHECK(string("get rank size"), group, HcomGetRankSize(group.c_str(), rank_size)); | |||||
| return true; | return true; | ||||
| } | } | ||||
| bool CommManager::DestroyGroup(const string &group) const { | bool CommManager::DestroyGroup(const string &group) const { | ||||
| HCCL_GROUP_CHECK_EMPTY(group); | HCCL_GROUP_CHECK_EMPTY(group); | ||||
| HCCL_GROUP_CHECK_IS_WORLD(group); | HCCL_GROUP_CHECK_IS_WORLD(group); | ||||
| HCCL_RUN_CHECK(string("destroy communicate group"), group, hcom_destroy_group(group.c_str())); | |||||
| HCCL_RUN_CHECK(string("destroy communicate group"), group, HcomDestroyGroup(group.c_str())); | |||||
| return true; | return true; | ||||
| } | } | ||||
| #elif defined(ENABLE_GPU) | #elif defined(ENABLE_GPU) | ||||
| @@ -110,7 +110,7 @@ def create_group(group, rank_num, rank_ids): | |||||
| c_array_rank_ids = c_array(ctypes.c_uint, rank_ids) | c_array_rank_ids = c_array(ctypes.c_uint, rank_ids) | ||||
| c_rank_num = ctypes.c_uint(rank_num) | c_rank_num = ctypes.c_uint(rank_num) | ||||
| c_group = c_str(group) | c_group = c_str(group) | ||||
| ret = HCCL_LIB_CTYPES.hcom_create_group(c_group, c_rank_num, c_array_rank_ids) | |||||
| ret = HCCL_LIB_CTYPES.HcomCreateGroup(c_group, c_rank_num, c_array_rank_ids) | |||||
| if ret != 0: | if ret != 0: | ||||
| raise RuntimeError('Create group error.') | raise RuntimeError('Create group error.') | ||||
| else: | else: | ||||
| @@ -129,7 +129,7 @@ def destroy_group(group): | |||||
| """ | """ | ||||
| check_group(group) | check_group(group) | ||||
| c_group = c_str(group) | c_group = c_str(group) | ||||
| ret = HCCL_LIB_CTYPES.hcom_destroy_group(c_group) | |||||
| ret = HCCL_LIB_CTYPES.HcomDestroyGroup(c_group) | |||||
| if ret != 0: | if ret != 0: | ||||
| raise RuntimeError('Destroy group error.') | raise RuntimeError('Destroy group error.') | ||||
| @@ -147,7 +147,7 @@ def get_rank_size(group="hccl_world_group"): | |||||
| check_group(group) | check_group(group) | ||||
| c_group = c_str(group) | c_group = c_str(group) | ||||
| c_rank_size = ctypes.c_uint() | c_rank_size = ctypes.c_uint() | ||||
| ret = HCCL_LIB_CTYPES.hcom_get_rank_size(c_group, ctypes.byref(c_rank_size)) | |||||
| ret = HCCL_LIB_CTYPES.HcomGetRankSize(c_group, ctypes.byref(c_rank_size)) | |||||
| if ret != 0: | if ret != 0: | ||||
| raise RuntimeError('Get rank size error.') | raise RuntimeError('Get rank size error.') | ||||
| @@ -164,7 +164,7 @@ def get_rank_id(group="hccl_world_group"): | |||||
| check_group(group) | check_group(group) | ||||
| c_group = c_str(group) | c_group = c_str(group) | ||||
| c_rank_id = ctypes.c_uint() | c_rank_id = ctypes.c_uint() | ||||
| ret = HCCL_LIB_CTYPES.hcom_get_rank_id(c_group, ctypes.byref(c_rank_id)) | |||||
| ret = HCCL_LIB_CTYPES.HcomGetRankId(c_group, ctypes.byref(c_rank_id)) | |||||
| if ret != 0: | if ret != 0: | ||||
| raise RuntimeError('Get rank id error.') | raise RuntimeError('Get rank id error.') | ||||
| @@ -184,7 +184,7 @@ def get_local_rank_size(group="hccl_world_group"): | |||||
| check_group(group) | check_group(group) | ||||
| c_group = c_str(group) | c_group = c_str(group) | ||||
| c_local_rank_size = ctypes.c_uint() | c_local_rank_size = ctypes.c_uint() | ||||
| ret = HCCL_LIB_CTYPES.hcom_get_local_rank_size(c_group, ctypes.byref(c_local_rank_size)) | |||||
| ret = HCCL_LIB_CTYPES.HcomGetLocalRankSize(c_group, ctypes.byref(c_local_rank_size)) | |||||
| if ret != 0: | if ret != 0: | ||||
| raise RuntimeError('Get local rank size error.') | raise RuntimeError('Get local rank size error.') | ||||
| @@ -203,7 +203,7 @@ def get_local_rank_id(group="hccl_world_group"): | |||||
| check_group(group) | check_group(group) | ||||
| c_group = c_str(group) | c_group = c_str(group) | ||||
| c_local_rank_id = ctypes.c_uint() | c_local_rank_id = ctypes.c_uint() | ||||
| ret = HCCL_LIB_CTYPES.hcom_get_local_rank_id(c_group, ctypes.byref(c_local_rank_id)) | |||||
| ret = HCCL_LIB_CTYPES.HcomGetLocalRankId(c_group, ctypes.byref(c_local_rank_id)) | |||||
| if ret != 0: | if ret != 0: | ||||
| raise RuntimeError('Get local rank id error.') | raise RuntimeError('Get local rank id error.') | ||||
| @@ -225,7 +225,7 @@ def get_world_rank_from_group_rank(group, group_rank_id): | |||||
| c_group = c_str(group) | c_group = c_str(group) | ||||
| c_group_rank_id = ctypes.c_uint(group_rank_id) | c_group_rank_id = ctypes.c_uint(group_rank_id) | ||||
| c_world_rank_id = ctypes.c_uint() | c_world_rank_id = ctypes.c_uint() | ||||
| ret = HCCL_LIB_CTYPES.hcom_get_world_rank_from_group_rank(c_group, c_group_rank_id, ctypes.byref(c_world_rank_id)) | |||||
| ret = HCCL_LIB_CTYPES.HcomGetWorldRankFromGroupRank(c_group, c_group_rank_id, ctypes.byref(c_world_rank_id)) | |||||
| if ret != 0: | if ret != 0: | ||||
| raise RuntimeError('Get world rank from group rank error.') | raise RuntimeError('Get world rank from group rank error.') | ||||
| @@ -247,7 +247,7 @@ def get_group_rank_from_world_rank(world_rank_id, group): | |||||
| c_group = c_str(group) | c_group = c_str(group) | ||||
| c_world_rank_id = ctypes.c_uint(world_rank_id) | c_world_rank_id = ctypes.c_uint(world_rank_id) | ||||
| c_group_rank_id = ctypes.c_uint() | c_group_rank_id = ctypes.c_uint() | ||||
| ret = HCCL_LIB_CTYPES.hcom_get_group_rank_from_world_rank(c_world_rank_id, c_group, ctypes.byref(c_group_rank_id)) | |||||
| ret = HCCL_LIB_CTYPES.HcomGetGroupRankFromWorldRank(c_world_rank_id, c_group, ctypes.byref(c_group_rank_id)) | |||||
| if ret != 0: | if ret != 0: | ||||
| raise RuntimeError('Get group rank from world rank error.') | raise RuntimeError('Get group rank from world rank error.') | ||||
| @@ -164,10 +164,11 @@ def test_transformer(): | |||||
| # assertion occurs while the loss value, overflow state or loss_scale value is wrong | # assertion occurs while the loss value, overflow state or loss_scale value is wrong | ||||
| loss_value = np.array(callback.loss_list) | loss_value = np.array(callback.loss_list) | ||||
| assert np.allclose(loss_value[0], 11.241604, 0, 0.000005) | |||||
| assert np.allclose(loss_value[0], 11.241624, 0, 0.000005) | |||||
| expect_loss_value = [11.241624, 11.243232, 11.217465, 11.204196, 11.2138195, | |||||
| 11.215386, 11.19053, 11.150403, 11.191858, 11.160057] | |||||
| expect_loss_value = [11.241604, 11.243231, 11.217458, 11.204156, 11.213805, | |||||
| 11.215374, 11.19065, 11.150393, 11.191824, 11.160044] | |||||
| print("loss value: {}".format(loss_value)) | print("loss value: {}".format(loss_value)) | ||||
| assert np.allclose(loss_value[0:10], expect_loss_value, 0, 0.0005) | assert np.allclose(loss_value[0:10], expect_loss_value, 0, 0.0005) | ||||
| @@ -85,8 +85,6 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} | |||||
| # dont remove the 4 lines above | # dont remove the 4 lines above | ||||
| "../../../mindspore/ccsrc/debug/data_dump/dump_json_parser.cc" | "../../../mindspore/ccsrc/debug/data_dump/dump_json_parser.cc" | ||||
| "../../../mindspore/ccsrc/debug/common.cc" | "../../../mindspore/ccsrc/debug/common.cc" | ||||
| "../../../mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc" | |||||
| "../../../mindspore/ccsrc/runtime/device/ascend/profiling/profiling_engine_impl.cc" | |||||
| "../../../mindspore/ccsrc/runtime/device/kernel_runtime.cc" | "../../../mindspore/ccsrc/runtime/device/kernel_runtime.cc" | ||||
| "../../../mindspore/ccsrc/runtime/device/memory_manager.cc" | "../../../mindspore/ccsrc/runtime/device/memory_manager.cc" | ||||
| "../../../mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc" | "../../../mindspore/ccsrc/runtime/device/kernel_runtime_manager.cc" | ||||
| @@ -1,124 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include <iostream> | |||||
| #include <memory> | |||||
| #include "./prof_reporter.h" | |||||
| #include "common/common_test.h" | |||||
| #include "runtime/device/ascend/profiling/profiling_manager.h" | |||||
| #define private public | |||||
| #include "runtime/device/ascend/profiling/plugin_impl.h" | |||||
| #undef private | |||||
| #include "runtime/device/ascend/profiling/profiling_engine_impl.h" | |||||
| namespace mindspore { | |||||
| namespace device { | |||||
| namespace ascend { | |||||
| class stubReporter : public Reporter { | |||||
| public: | |||||
| stubReporter() = default; | |||||
| ~stubReporter() = default; | |||||
| int Report(const Msprof::Engine::ReporterData *data) override; | |||||
| int Flush() override; | |||||
| }; | |||||
| int stubReporter::Report(const Msprof::Engine::ReporterData *data) { return 0; } | |||||
| int stubReporter::Flush() { return 0; } | |||||
| class TestAscendProfiling : public UT::Common { | |||||
| public: | |||||
| TestAscendProfiling() {} | |||||
| }; | |||||
| TEST_F(TestAscendProfiling, test_profiling_GetJobId) { | |||||
| auto job_id = ProfilingManager::GetInstance().GetJobId(); | |||||
| printf("get job_id:%ld\n", job_id); | |||||
| } | |||||
| int test_profiling_start() { | |||||
| (void)setenv("PROFILING_MODE", "true", 1); | |||||
| (void)setenv("PROFILING_OPTIONS", "training_trace:task_trace", 1); | |||||
| auto ret = ProfilingManager::GetInstance().StartupProfiling(0); | |||||
| (void)unsetenv("PROFILING_MODE"); | |||||
| (void)unsetenv("PROFILING_OPTIONS"); | |||||
| return ret; | |||||
| } | |||||
| TEST_F(TestAscendProfiling, test_profiling_start) { | |||||
| auto ret = test_profiling_start(); | |||||
| ASSERT_EQ(ret, true); | |||||
| } | |||||
| int test_profiling_stop() { | |||||
| (void)setenv("PROFILING_MODE", "true", 1); | |||||
| auto engine = std::make_shared<ProfilingEngineImpl>(); | |||||
| auto report = std::make_shared<stubReporter>(); | |||||
| auto plug = engine->CreatePlugin(); | |||||
| plug->Init(report.get()); | |||||
| auto ret = ProfilingManager::GetInstance().StopProfiling(); | |||||
| plug->UnInit(); | |||||
| engine->ReleasePlugin(plug); | |||||
| (void)unsetenv("PROFILING_OPTIONS"); | |||||
| return ret; | |||||
| } | |||||
| TEST_F(TestAscendProfiling, test_profiling_stop) { | |||||
| auto ret = test_profiling_stop(); | |||||
| ASSERT_EQ(ret, true); | |||||
| } | |||||
| int test_profiling_rpt() { | |||||
| (void)setenv("PROFILING_MODE", "true", 1); | |||||
| std::map<uint32_t, std::string> op_taskId_map; | |||||
| op_taskId_map[1] = "add"; | |||||
| op_taskId_map[2] = "mul"; | |||||
| auto engine = std::make_shared<ProfilingEngineImpl>(); | |||||
| auto report = std::make_shared<stubReporter>(); | |||||
| auto plug = engine->CreatePlugin(); | |||||
| plug->Init(report.get()); | |||||
| ProfilingManager::GetInstance().ReportProfilingData(op_taskId_map); | |||||
| plug->UnInit(); | |||||
| engine->ReleasePlugin(plug); | |||||
| (void)unsetenv("PROFILING_OPTIONS"); | |||||
| return 0; | |||||
| } | |||||
| TEST_F(TestAscendProfiling, test_profiling_rpt) { | |||||
| auto ret = test_profiling_rpt(); | |||||
| ASSERT_EQ(ret, false); | |||||
| } | |||||
| int test_profiling_rpt_abnormal() { | |||||
| std::map<uint32_t, std::string> op_taskId_map; | |||||
| ProfilingManager::GetInstance().ReportProfilingData(op_taskId_map); | |||||
| (void)setenv("PROFILING_MODE", "true", 1); | |||||
| ProfilingManager::GetInstance().ReportProfilingData(op_taskId_map); | |||||
| op_taskId_map[1] = "add"; | |||||
| op_taskId_map[2] = "mul"; | |||||
| ProfilingManager::GetInstance().ReportProfilingData(op_taskId_map); | |||||
| (void)unsetenv("PROFILING_OPTIONS"); | |||||
| return 0; | |||||
| } | |||||
| TEST_F(TestAscendProfiling, test_profiling_rpt_abnormal) { | |||||
| auto ret = test_profiling_rpt_abnormal(); | |||||
| ASSERT_EQ(ret, false); | |||||
| } | |||||
| } // namespace ascend | |||||
| } // namespace device | |||||
| } // namespace mindspore | |||||
| @@ -63,32 +63,32 @@ HcclResult hcom_reduce_scatter(const char *tag, void *inputPtr, void *outputPtr, | |||||
| } | } | ||||
| /* 获取group内的rank个数 */ | /* 获取group内的rank个数 */ | ||||
| HcclResult hcom_get_rank_size(const char *group, u32 *rankSize) { return HCCL_SUCCESS; } | |||||
| HcclResult HcomGetRankSize(const char *group, u32 *rankSize) { return HCCL_SUCCESS; } | |||||
| /* python获取上云场景内的rank个数 */ | /* python获取上云场景内的rank个数 */ | ||||
| HcclResult hcom_python_get_rank_size(u32 *rankSize) { return HCCL_SUCCESS; } | HcclResult hcom_python_get_rank_size(u32 *rankSize) { return HCCL_SUCCESS; } | ||||
| /* 获取本rank的id */ | /* 获取本rank的id */ | ||||
| HcclResult hcom_get_rank_id(const char *group, u32 *rankId) { return HCCL_SUCCESS; } | |||||
| HcclResult HcomGetRankId(const char *group, u32 *rankId) { return HCCL_SUCCESS; } | |||||
| /* 获取本rank的id */ | /* 获取本rank的id */ | ||||
| HcclResult hcom_python_get_rank_id(u32 *rankId) { return HCCL_SUCCESS; } | HcclResult hcom_python_get_rank_id(u32 *rankId) { return HCCL_SUCCESS; } | ||||
| /* 获取本rank的id */ | /* 获取本rank的id */ | ||||
| HcclResult hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, u32 *worldRank) { | |||||
| HcclResult HcomGetWorldRankFromGroupRank(const char *group, u32 groupRank, u32 *worldRank) { | |||||
| return HCCL_SUCCESS; | return HCCL_SUCCESS; | ||||
| } | } | ||||
| /* 获取通信域的rank个数 */ | /* 获取通信域的rank个数 */ | ||||
| HcclResult hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, u32 *groupRank) { | |||||
| HcclResult HcomGetGroupRankFromWorldRank(u32 worldRank, const char *group, u32 *groupRank) { | |||||
| return HCCL_SUCCESS; | return HCCL_SUCCESS; | ||||
| } | } | ||||
| /* 创建group */ | /* 创建group */ | ||||
| HcclResult hcom_create_group(const char *group, u32 rankNum, u32 *rankIds) { return HCCL_SUCCESS; } | |||||
| HcclResult HcomCreateGroup(const char *group, u32 rankNum, u32 *rankIds) { return HCCL_SUCCESS; } | |||||
| /* 销毁group */ | /* 销毁group */ | ||||
| HcclResult hcom_destroy_group(const char *group) { return HCCL_SUCCESS; } | |||||
| HcclResult HcomDestroyGroup(const char *group) { return HCCL_SUCCESS; } | |||||
| /* 发送消息 */ | /* 发送消息 */ | ||||
| HcclResult hcom_send(const char *tag, void *inputPtr, u64 count, HcclDataType dataType, u32 destRank, u32 srTag, | HcclResult hcom_send(const char *tag, void *inputPtr, u64 count, HcclDataType dataType, u32 destRank, u32 srTag, | ||||
| @@ -15,7 +15,6 @@ | |||||
| */ | */ | ||||
| #include <string> | #include <string> | ||||
| #include "prof_mgr_core.h" | #include "prof_mgr_core.h" | ||||
| #include <string> | |||||
| namespace Msprof { | namespace Msprof { | ||||
| namespace Engine { | namespace Engine { | ||||
| @@ -51,3 +50,7 @@ void* ProfMgrStartUp(const ProfMgrCfg* cfg) { return const_cast<void*>(reinterpr | |||||
| * PROFILING_FAILED -1 (failed) | * PROFILING_FAILED -1 (failed) | ||||
| */ | */ | ||||
| int ProfMgrStop(void* handle) { return 0; } | int ProfMgrStop(void* handle) { return 0; } | ||||
| namespace Analysis::Dvvp::ProfilerCommon { | |||||
| uint32_t MsprofilerInit() { return 0; } | |||||
| } | |||||
| @@ -151,4 +151,12 @@ int AdxDataDumpServerUnInit() { return 0; } | |||||
| RTS_API rtError_t rtGetTaskIdAndStreamID(uint32_t *taskid, uint32_t *streamid) { return RT_ERROR_NONE; } | RTS_API rtError_t rtGetTaskIdAndStreamID(uint32_t *taskid, uint32_t *streamid) { return RT_ERROR_NONE; } | ||||
| RTS_API rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback) {return RT_ERROR_NONE; } | |||||
| RTS_API rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback) {return RT_ERROR_NONE; } | |||||
| RTS_API rtError_t rtRegDeviceStateCallback(const char *regName, rtDeviceStateCallback callback) {return RT_ERROR_NONE; } | |||||
| RTS_API rtError_t rtSetMsprofReporterCallback(MsprofReporterCallback callback) {return RT_ERROR_NONE; } | |||||
| RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallbackByModule callback) { | |||||
| return RT_ERROR_NONE; | |||||
| } | |||||