Merge pull request !2102 from yanghaoran/r1.6pull/2103/head
| @@ -22,6 +22,8 @@ | |||
| #include <mutex> | |||
| #include <unordered_map> | |||
| #include "graph/profiler.h" | |||
| #include "external/ge/ge_api_types.h" | |||
| #include "toolchain/prof_callback.h" | |||
| namespace ge { | |||
| namespace profiling { | |||
| enum { | |||
| @@ -46,6 +48,7 @@ enum { | |||
| kCopyH2D, | |||
| kProfilingIndexEnd | |||
| }; | |||
| constexpr uint64_t kInvalidHashId = 0ULL; | |||
| class ProfilingContext { | |||
| public: | |||
| @@ -100,9 +103,16 @@ class ProfilingContext { | |||
| } | |||
| int64_t RegisterString(const std::string &str); | |||
| int64_t RegisterStringHash(const uint64_t hash_id, const std::string &str); | |||
| void UpdateElementHashId(MsprofReporterCallback reporter_callback); | |||
| static Status QueryHashId(const MsprofReporterCallback reporter_callback, const std::string &src_str, | |||
| uint64_t &hash_id); | |||
| size_t GetRegisterStringNum() const { | |||
| return strings_to_index_.size(); | |||
| } | |||
| private: | |||
| void RegisterString(int64_t index, const std::string &str); | |||
| void UpdateHashByStr(const std::string &str, const uint64_t hash); | |||
| void Init(); | |||
| private: | |||
| @@ -0,0 +1,35 @@ | |||
| /** | |||
| * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef INC_FRAMEWORK_OMG_MODEL_TOOL_H_ | |||
| #define INC_FRAMEWORK_OMG_MODEL_TOOL_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include "framework/common/debug/ge_log.h" | |||
| #include "proto/ge_ir.pb.h" | |||
| namespace ge { | |||
| class GE_FUNC_VISIBILITY ModelTool { | |||
| public: | |||
| static Status GetModelInfoFromOm(const char *model_file, ge::proto::ModelDef &model_def, uint32_t &modeldef_size); | |||
| static Status GetModelInfoFromPbtxt(const char *model_file, ge::proto::ModelDef &model_def); | |||
| }; | |||
| } // namespace ge | |||
| #endif // INC_FRAMEWORK_OMG_MODEL_TOOL_H_ | |||
| @@ -1 +1 @@ | |||
| Subproject commit 2659f49dcb14c0773e10e17ee9896b7be4d8e7be | |||
| Subproject commit dc5ac26aac4c49b4e72cd91d4e6d6a57bbe03af4 | |||
| @@ -145,9 +145,9 @@ struct ResultSummary { | |||
| #pragma pack(push, 1) | |||
| struct AsyncWait { | |||
| uint8_t waitType; // wait type, FWk_ADPT_WAIT_TPYE_EVENT: event wait | |||
| uint32_t waitId; // wait id, GE refresh | |||
| uint32_t timeOut; // reserved | |||
| uint8_t waitType; // wait type, FWK_ADPT_WAIT_TYPE_EVENT: event wait | |||
| uint32_t waitId; // wait id, GE refresh | |||
| uint32_t timeOut; // reserved | |||
| uint64_t reserved; | |||
| }; | |||
| #pragma pack(pop) | |||
| @@ -79,9 +79,6 @@ typedef long LONG; | |||
| #define MMPA_THREAD_SCHED_OTHER SCHED_OTHER | |||
| #define MMPA_THREAD_MIN_STACK_SIZE PTHREAD_STACK_MIN | |||
| #define MMPA_PATH_SEPARATOR_STR "/" | |||
| #define MMPA_PATH_SEPARATOR_CHAR '/' | |||
| #define MM_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER | |||
| #define MMPA_MAX_NI 19 | |||
| @@ -1,86 +1,83 @@ | |||
| /** | |||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MMPA_TYPEDEF_WIN_H | |||
| #define MMPA_TYPEDEF_WIN_H | |||
| #ifdef __cplusplus | |||
| #if __cplusplus | |||
| extern "C" { | |||
| #endif // __cplusplus | |||
| #endif // __cplusplus | |||
| #ifndef FALSE | |||
| #define FALSE 0 | |||
| #endif | |||
| #ifndef TRUE | |||
| #define TRUE 1 | |||
| #endif | |||
| #define EN_OK 0 | |||
| #define EN_ERR 1 | |||
| #define EN_ERROR (-1) | |||
| #define EN_INVALID_PARAM (-2) | |||
| #define EN_TIMEOUT (-3) | |||
| #define HANDLE_INVALID_VALUE (-1) | |||
| #define INVALID_SOCKET_HANDLE INVALID_SOCKET | |||
| #define MMPA_MEM_MAX_LEN (0x7fffffff) | |||
| #define MMPA_PROCESS_ERROR (0x7fffffff) | |||
| #define MMPA_ONE_THOUSAND 1000 | |||
| #define MMPA_COMPUTER_BEGIN_YEAR 1900 | |||
| #define SUMMER_TIME_OR_NOT (-1) | |||
| #define MMPA_ZERO 0 | |||
| #define MMPA_VALUE_ONE 1 | |||
| #define MMPA_SOCKET_MAIN_EDITION 2 | |||
| #define MMPA_SOCKET_SECOND_EDITION 0 | |||
| #define MMPA_PIPE_BUF_SIZE 1024 | |||
| #define MMPA_MAX_SCANDIR_COUNT 1024 | |||
| #define MAX_IOVEC_SIZE 32 | |||
| #define MMPA_PIPE_COUNT 2 | |||
| #define MMPA_THREADNAME_SIZE 16 | |||
| #define MMPA_MIN_OS_NAME_SIZE (MAX_COMPUTERNAME_LENGTH + 1) | |||
| #define MMPA_MIN_OS_VERSION_SIZE 64 | |||
| #define MMPA_MAX_NI 19 | |||
| #define MMPA_MIDDLE_NI 5 | |||
| #define MMPA_LOW_NI (-5) | |||
| #define MMPA_MIN_NI (-20) | |||
| #define MMPA_MAX_FILE 128 | |||
| #define MMPA_PATH_SEPARATOR_STR "\\" | |||
| #define MMPA_PATH_SEPARATOR_CHAR '\\' | |||
| #define MMPA_MAX_THREAD_PIO 99 | |||
| #define MMPA_MIDDLE_THREAD_PIO 66 | |||
| #define MMPA_LOW_THREAD_PIO 33 | |||
| #define MMPA_MIN_THREAD_PIO 1 | |||
| #define MMPA_THREAD_SCHED_RR 0 | |||
| #define MMPA_THREAD_SCHED_FIFO 0 | |||
| #define MMPA_THREAD_SCHED_OTHER 0 | |||
| #define MMPA_THREAD_MIN_STACK_SIZE 0 | |||
| #define MM_MUTEX_INITIALIZER NULL | |||
| #ifdef __cplusplus | |||
| #if __cplusplus | |||
| } | |||
| #endif // __cplusplus | |||
| #endif // __cplusplus | |||
| #endif // MMPA_TYPEDEF_WIN_H | |||
| /** | |||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MMPA_TYPEDEF_WIN_H | |||
| #define MMPA_TYPEDEF_WIN_H | |||
| #ifdef __cplusplus | |||
| #if __cplusplus | |||
| extern "C" { | |||
| #endif // __cplusplus | |||
| #endif // __cplusplus | |||
| #ifndef FALSE | |||
| #define FALSE 0 | |||
| #endif | |||
| #ifndef TRUE | |||
| #define TRUE 1 | |||
| #endif | |||
| #define EN_OK 0 | |||
| #define EN_ERR 1 | |||
| #define EN_ERROR (-1) | |||
| #define EN_INVALID_PARAM (-2) | |||
| #define EN_TIMEOUT (-3) | |||
| #define HANDLE_INVALID_VALUE (-1) | |||
| #define INVALID_SOCKET_HANDLE INVALID_SOCKET | |||
| #define MMPA_MEM_MAX_LEN (0x7fffffff) | |||
| #define MMPA_PROCESS_ERROR (0x7fffffff) | |||
| #define MMPA_ONE_THOUSAND 1000 | |||
| #define MMPA_COMPUTER_BEGIN_YEAR 1900 | |||
| #define SUMMER_TIME_OR_NOT (-1) | |||
| #define MMPA_ZERO 0 | |||
| #define MMPA_VALUE_ONE 1 | |||
| #define MMPA_SOCKET_MAIN_EDITION 2 | |||
| #define MMPA_SOCKET_SECOND_EDITION 0 | |||
| #define MMPA_PIPE_BUF_SIZE 1024 | |||
| #define MMPA_MAX_SCANDIR_COUNT 1024 | |||
| #define MAX_IOVEC_SIZE 32 | |||
| #define MMPA_PIPE_COUNT 2 | |||
| #define MMPA_THREADNAME_SIZE 16 | |||
| #define MMPA_MIN_OS_NAME_SIZE (MAX_COMPUTERNAME_LENGTH + 1) | |||
| #define MMPA_MIN_OS_VERSION_SIZE 64 | |||
| #define MMPA_MAX_NI 19 | |||
| #define MMPA_MIDDLE_NI 5 | |||
| #define MMPA_LOW_NI (-5) | |||
| #define MMPA_MIN_NI (-20) | |||
| #define MMPA_MAX_FILE 128 | |||
| #define MMPA_MAX_THREAD_PIO 99 | |||
| #define MMPA_MIDDLE_THREAD_PIO 66 | |||
| #define MMPA_LOW_THREAD_PIO 33 | |||
| #define MMPA_MIN_THREAD_PIO 1 | |||
| #define MMPA_THREAD_SCHED_RR 0 | |||
| #define MMPA_THREAD_SCHED_FIFO 0 | |||
| #define MMPA_THREAD_SCHED_OTHER 0 | |||
| #define MMPA_THREAD_MIN_STACK_SIZE 0 | |||
| #define MM_MUTEX_INITIALIZER NULL | |||
| #ifdef __cplusplus | |||
| #if __cplusplus | |||
| } | |||
| #endif // __cplusplus | |||
| #endif // __cplusplus | |||
| #endif // MMPA_TYPEDEF_WIN_H | |||
| @@ -142,6 +142,74 @@ REG_OP(BatchNorm) | |||
| .ATTR(is_training, Bool, true) | |||
| .OP_END_FACTORY_REG(BatchNorm) | |||
| /** | |||
| * @brief After the mean and reciprocal of standard deviation(invert_std) are separately calculated on each device, | |||
| * the mean and reciprocal of standard deviation(invert_std) data on each device are normalized, | |||
| * a total mean and reciprocal of standard deviation(invert_std) are returned, and running_var are updated. | |||
| * @par Inputs: | |||
| * include: | |||
| * @li mean_all: A Tensor. The mean of each device. Must be one of the following types: float16, float32. | |||
| * @li invert_std_all: A Tensor. Reciprocal of the variances of each device. Must be one of the following types: float16, float32. | |||
| * @li count_all: A Tensor. Number of data for each device. Must be one of the following types: float16, float32. | |||
| * @li mean_broadcast: A Tensor. The overall average and broadcast. Must be one of the following types: float16, float32. | |||
| * @li count_sum: A Tensor. General statistics. Must be one of the following types: float16, float32. | |||
| * @li running_var: A Tensor. Runtime variance. Must be one of the following types: float16, float32. \n | |||
| * @par Attributes: | |||
| * Two Attributes, including: | |||
| * @li momentum: An optional float. Defaults to 0.1. \n | |||
| * @li epsilon: An optional float. Defaults to 0.001. \n | |||
| * @par Outputs: | |||
| * include: | |||
| * @li invert_std: A Tensor. It's inverse of total variance. | |||
| * @li running_var_update: A Tensor. It's moving variance of each device after the update. \n | |||
| * @par Third-party framework compatibility | |||
| * ReduceMeanWithCount and SyncBatchNormGatherStatsWithCounts and SyncBNTrainingUpdate | |||
| * compatible with the Pytorch operator BatchNormGatherStatsWithCounts. | |||
| */ | |||
| REG_OP(SyncBatchNormGatherStatsWithCounts) | |||
| .INPUT(mean_all, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .INPUT(invert_std_all, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .INPUT(count_all, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .INPUT(mean_broadcast, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .INPUT(count_sum, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .INPUT(running_var, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .OUTPUT(invert_std, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .OUTPUT(running_var_update, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .ATTR(momentum, Float, 0.1) | |||
| .ATTR(epsilon, Float, 0.001) | |||
| .OP_END_FACTORY_REG(SyncBatchNormGatherStatsWithCounts) | |||
| /** | |||
| * @brief update running_mean. | |||
| * @par Inputs: | |||
| * include: | |||
| * @li mean: A Tensor. The mean of each device. Must be one of the following types: float16, float32. | |||
| * @li running_mean: A Tensor. Runtime Mean. Must be one of the following types: float16, float32. \n | |||
| * @par Attributes: | |||
| * One Attribute, including: | |||
| * @li momentum: An optional float. Defaults to 0.1. \n | |||
| * @par Outputs: | |||
| * include: | |||
| * @li running_mean_update: A Tensor. It's moving mean of each device after the update. \n | |||
| * @par Third-party framework compatibility | |||
| * ReduceMeanWithCount and SyncBatchNormGatherStatsWithCounts and SyncBNTrainingUpdate | |||
| * compatible with the Pytorch operator BatchNormGatherStatsWithCounts. | |||
| */ | |||
| REG_OP(SyncBNTrainingUpdate) | |||
| .INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .INPUT(running_mean, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .OUTPUT(running_mean_update, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .ATTR(momentum, Float, 0.1) | |||
| .OP_END_FACTORY_REG(SyncBNTrainingUpdate) | |||
| /** | |||
| *@brief part of SyncBatchNormBackward . \n | |||
| @@ -515,6 +515,34 @@ REG_OP(ReduceSumD) | |||
| .ATTR(keep_dims, Bool, false) | |||
| .OP_END_FACTORY_REG(ReduceSumD) | |||
| /** | |||
| *@brief Calculate the total mean based on the mean of each device . \n | |||
| *@par Inputs: | |||
| * Three inputs, including: | |||
| *@li x: A Tensor. Must be one of the following types: float16, float32 . | |||
| *@li count: A Tensor. Must be one of the following types: float16, float32 . | |||
| *@li count_sum: A Tensor. Must be one of the following types: float16, float32 . \n | |||
| *@par Attributes: | |||
| *@li axes: A required 1D list or tuple of int32 or int64. Specifies the dimensions to reduce. | |||
| *@li keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n | |||
| *@par Outputs: | |||
| *y: The reduced tensor. Has the same type and format as input "x" . \n | |||
| *@par Third-party framework compatibility | |||
| * Compatible with the TensorFlow operator Sum. | |||
| */ | |||
| REG_OP(ReduceMeanWithCount) | |||
| .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .INPUT(count, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .INPUT(count_sum, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) | |||
| .REQUIRED_ATTR(axes, ListInt) | |||
| .ATTR(keep_dims, Bool, false) | |||
| .OP_END_FACTORY_REG(ReduceMeanWithCount) | |||
| /** | |||
| *@brief Calculates the "logical sum" of elements of a tensor in a dimension . \n | |||
| @@ -1363,6 +1391,64 @@ REG_OP(ReduceStdV2Update) | |||
| .ATTR(unbiased, Bool, true) | |||
| .ATTR(keepdim, Bool, false) | |||
| .OP_END_FACTORY_REG(ReduceStdV2Update) | |||
| /** | |||
| *@brief Computes the log and sum and exp of elements across dimensions of a tensor. | |||
| * Reduces "x" along the dimensions given in "axes". | |||
| * Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each | |||
| * entry in "axes". If "keep_dims" is true, the reduced dimensions | |||
| * are retained with length 1. | |||
| * | |||
| *@par Inputs: | |||
| * Two inputs, including: | |||
| *@li x: A Tensor. Must be one of the following types: | |||
| * float32, float16, int32, int64, uint32, uint64, double | |||
| *@li axes: A 1D list or tuple of int32 or int64. Specifies the dimensions to reduce . \n | |||
| * | |||
| *@par Attributes: | |||
| *keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n | |||
| * | |||
| *@par Outputs: | |||
| *y: The reduced tensor. Has the same type and format as input "x" . \n | |||
| * | |||
| *@par Third-party framework compatibility | |||
| * Compatible with the Onnx operator ReduceLogSumExp. | |||
| */ | |||
| REG_OP(ReduceLogSumExp) | |||
| .INPUT(x, TensorType::NumberType()) | |||
| .INPUT(axes, TensorType::IndexNumberType()) | |||
| .OUTPUT(y, TensorType::NumberType()) | |||
| .ATTR(keep_dims, Bool, false) | |||
| .OP_END_FACTORY_REG(ReduceLogSumExp) | |||
| /** | |||
| *@brief Computes the log and sum of elements across dimensions of a tensor. | |||
| * Reduces "x" along the dimensions given in "axes". | |||
| * Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each | |||
| * entry in "axes". If "keep_dims" is true, the reduced dimensions | |||
| * are retained with length 1. | |||
| * | |||
| *@par Inputs: | |||
| * Two inputs, including: | |||
| *@li x: A Tensor. Must be one of the following types: | |||
| * float32, float16, int32, int64, uint32, uint64, double | |||
| *@li axes: A 1D list or tuple of int32 or int64. Specifies the dimensions to reduce . \n | |||
| * | |||
| *@par Attributes: | |||
| *keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n | |||
| * | |||
| *@par Outputs: | |||
| *y: The reduced tensor. Has the same type and format as input "x" . \n | |||
| * | |||
| *@par Third-party framework compatibility | |||
| * Compatible with the Onnx operator ReduceLogSum. | |||
| */ | |||
| REG_OP(ReduceLogSum) | |||
| .INPUT(x, TensorType::NumberType()) | |||
| .INPUT(axes, TensorType::IndexNumberType()) | |||
| .OUTPUT(y, TensorType::NumberType()) | |||
| .ATTR(keep_dims, Bool, false) | |||
| .OP_END_FACTORY_REG(ReduceLogSum) | |||
| } //namespace ge | |||
| #endif // OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_ | |||
| @@ -1,8 +1,17 @@ | |||
| /* | |||
| * Copyright (c) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved. | |||
| * Description: handle perf data | |||
| * Author: xp | |||
| * Create: 2019-10-13 | |||
| /** | |||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MSPROFILER_PROF_CALLBACK_H_ | |||
| @@ -0,0 +1,450 @@ | |||
| /* | |||
| * Copyright (c) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved. | |||
| * Description: handle perf data | |||
| * Author: Huawei Technologies Co., Ltd. | |||
| * Create: 2019-10-13 | |||
| */ | |||
| #ifndef MSPROFILER_PROF_COMMON_H_ | |||
| #define MSPROFILER_PROF_COMMON_H_ | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif // __cplusplus | |||
| #include <stdint.h> | |||
| #define MSPROF_DATA_HEAD_MAGIC_NUM 0x5a5a | |||
| enum MsprofDataTag { | |||
| MSPROF_ACL_DATA_TAG = 0, //acl data tag, range: 0~19 | |||
| MSPROF_GE_DATA_TAG_MODEL_LOAD = 20, //ge data tag, range: 20~39 | |||
| MSPROF_GE_DATA_TAG_FUSION = 21, | |||
| MSPROF_GE_DATA_TAG_INFER = 22, | |||
| MSPROF_GE_DATA_TAG_TASK = 23, | |||
| MSPROF_GE_DATA_TAG_TENSOR = 24, | |||
| MSPROF_GE_DATA_TAG_STEP = 25, | |||
| MSPROF_GE_DATA_TAG_ID_MAP = 26, | |||
| MSPROF_GE_DATA_TAG_HOST_SCH = 27, | |||
| MSPROF_RUNTIME_DATA_TAG_API = 40, //runtime data tag, range: 40~59 | |||
| MSPROF_RUNTIME_DATA_TAG_TRACK = 41, | |||
| MSPROF_AICPU_DATA_TAG = 60, //aicpu data tag, range: 60~79 | |||
| MSPROF_HCCL_DATA_TAG = 80, //hccl data tag, range: 80~99 | |||
| MSPROF_DP_DATA_TAG = 100, //dp data tag, range: 100~119 | |||
| MSPROF_MSPROFTX_DATA_TAG = 120, //msproftx data tag, range: 120~139 | |||
| MSPROF_DATA_TAG_MAX = 65536, //exclusive upper bound: dataTag fields are uint16_t (max 65535) | |||
| }; | |||
| /** | |||
| * @brief struct of mixed data | |||
| */ | |||
| #define MSPROF_MIX_DATA_RESERVE_BYTES 7 | |||
| #define MSPROF_MIX_DATA_STRING_LEN 120 | |||
| enum MsprofMixDataType { | |||
| MSPROF_MIX_DATA_HASH_ID = 0, | |||
| MSPROF_MIX_DATA_STRING, | |||
| }; | |||
| struct MsprofMixData { | |||
| uint8_t type; // MsprofMixDataType | |||
| uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES]; | |||
| union { | |||
| uint64_t hashId; | |||
| char dataStr[MSPROF_MIX_DATA_STRING_LEN]; | |||
| } data; | |||
| }; | |||
| using MixData = struct MsprofMixData; | |||
| /** | |||
| * @brief profiling command info | |||
| */ | |||
| #define MSPROF_MAX_DEV_NUM 64 | |||
| struct MsprofCommandHandle { | |||
| uint64_t profSwitch; | |||
| uint64_t profSwitchHi; | |||
| uint32_t devNums; | |||
| uint32_t devIdList[MSPROF_MAX_DEV_NUM]; | |||
| uint32_t modelId; | |||
| uint32_t type; | |||
| }; | |||
| /** | |||
| * @brief struct of data reported by acl | |||
| */ | |||
| #define MSPROF_ACL_DATA_RESERVE_BYTES 32 | |||
| #define MSPROF_ACL_API_NAME_LEN 64 | |||
| enum MsprofAclApiType { | |||
| MSPROF_ACL_API_TYPE_OP = 1, | |||
| MSPROF_ACL_API_TYPE_MODEL, | |||
| MSPROF_ACL_API_TYPE_RUNTIME, | |||
| MSPROF_ACL_API_TYPE_OTHERS, | |||
| }; | |||
| struct MsprofAclProfData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_ACL_DATA_TAG; | |||
| uint32_t apiType; // enum MsprofAclApiType | |||
| uint64_t beginTime; | |||
| uint64_t endTime; | |||
| uint32_t processId; | |||
| uint32_t threadId; | |||
| char apiName[MSPROF_ACL_API_NAME_LEN]; | |||
| uint8_t reserve[MSPROF_ACL_DATA_RESERVE_BYTES]; | |||
| }; | |||
| /** | |||
| * @brief struct of data reported by GE | |||
| */ | |||
| #define MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES 104 | |||
| struct MsprofGeProfModelLoadData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_MODEL_LOAD; | |||
| uint32_t modelId; | |||
| MixData modelName; | |||
| uint64_t startTime; | |||
| uint64_t endTime; | |||
| uint8_t reserve[MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES]; | |||
| }; | |||
| #define MSPROF_GE_FUSION_DATA_RESERVE_BYTES 8 | |||
| #define MSPROF_GE_FUSION_OP_NUM 8 | |||
| struct MsprofGeProfFusionData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_FUSION; | |||
| uint32_t modelId; | |||
| MixData fusionName; | |||
| uint64_t inputMemSize; | |||
| uint64_t outputMemSize; | |||
| uint64_t weightMemSize; | |||
| uint64_t workspaceMemSize; | |||
| uint64_t totalMemSize; | |||
| uint64_t fusionOpNum; | |||
| uint64_t fusionOp[MSPROF_GE_FUSION_OP_NUM]; | |||
| uint8_t reserve[MSPROF_GE_FUSION_DATA_RESERVE_BYTES]; | |||
| }; | |||
| #define MSPROF_GE_INFER_DATA_RESERVE_BYTES 64 | |||
| struct MsprofGeProfInferData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_INFER; | |||
| uint32_t modelId; | |||
| MixData modelName; | |||
| uint32_t requestId; | |||
| uint32_t threadId; | |||
| uint64_t inputDataStartTime; | |||
| uint64_t inputDataEndTime; | |||
| uint64_t inferStartTime; | |||
| uint64_t inferEndTime; | |||
| uint64_t outputDataStartTime; | |||
| uint64_t outputDataEndTime; | |||
| uint8_t reserve[MSPROF_GE_INFER_DATA_RESERVE_BYTES]; | |||
| }; | |||
| #define MSPROF_GE_TASK_DATA_RESERVE_BYTES 16 | |||
| #define MSPROF_GE_OP_TYPE_LEN 56 | |||
| enum MsprofGeTaskType { | |||
| MSPROF_GE_TASK_TYPE_AI_CORE = 0, | |||
| MSPROF_GE_TASK_TYPE_AI_CPU, | |||
| MSPROF_GE_TASK_TYPE_AIV, | |||
| }; | |||
| enum MsprofGeShapeType { | |||
| MSPROF_GE_SHAPE_TYPE_STATIC = 0, | |||
| MSPROF_GE_SHAPE_TYPE_DYNAMIC, | |||
| }; | |||
| struct MsprofGeOpType { | |||
| uint8_t type; // MsprofMixDataType | |||
| uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES]; | |||
| union { | |||
| uint64_t hashId; | |||
| char dataStr[MSPROF_GE_OP_TYPE_LEN]; | |||
| } data; | |||
| }; | |||
| struct MsprofGeProfTaskData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_TASK; | |||
| uint32_t taskType; // MsprofGeTaskType | |||
| MixData opName; | |||
| MsprofGeOpType opType; | |||
| uint64_t curIterNum; | |||
| uint64_t timeStamp; | |||
| uint32_t shapeType; // MsprofGeShapeType | |||
| uint32_t blockDims; | |||
| uint32_t modelId; | |||
| uint32_t streamId; | |||
| uint32_t taskId; | |||
| uint32_t threadId; | |||
| uint8_t reserve[MSPROF_GE_TASK_DATA_RESERVE_BYTES]; | |||
| }; | |||
| #define MSPROF_GE_TENSOR_DATA_RESERVE_BYTES 8 | |||
| #define MSPROF_GE_TENSOR_DATA_SHAPE_LEN 8 | |||
| #define MSPROF_GE_TENSOR_DATA_NUM 5 | |||
| enum MsprofGeTensorType { | |||
| MSPROF_GE_TENSOR_TYPE_INPUT = 0, | |||
| MSPROF_GE_TENSOR_TYPE_OUTPUT, | |||
| }; | |||
| struct MsprofGeTensorData { | |||
| uint32_t tensorType; // MsprofGeTensorType | |||
| uint32_t format; | |||
| uint32_t dataType; | |||
| uint32_t shape[MSPROF_GE_TENSOR_DATA_SHAPE_LEN]; | |||
| }; | |||
| struct MsprofGeProfTensorData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_TENSOR; | |||
| uint32_t modelId; | |||
| uint64_t curIterNum; | |||
| uint32_t streamId; | |||
| uint32_t taskId; | |||
| uint32_t tensorNum; | |||
| MsprofGeTensorData tensorData[MSPROF_GE_TENSOR_DATA_NUM]; | |||
| uint8_t reserve[MSPROF_GE_TENSOR_DATA_RESERVE_BYTES]; | |||
| }; | |||
| #define MSPROF_GE_STEP_DATA_RESERVE_BYTES 27 | |||
| enum MsprofGeStepTag { | |||
| MSPROF_GE_STEP_TAG_BEGIN = 0, | |||
| MSPROF_GE_STEP_TAG_END, | |||
| }; | |||
| struct MsprofGeProfStepData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_STEP; | |||
| uint32_t modelId; | |||
| uint32_t streamId; | |||
| uint32_t taskId; | |||
| uint64_t timeStamp; | |||
| uint64_t curIterNum; | |||
| uint32_t threadId; | |||
| uint8_t tag; // MsprofGeStepTag | |||
| uint8_t reserve[MSPROF_GE_STEP_DATA_RESERVE_BYTES]; | |||
| }; | |||
| #define MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES 6 | |||
| struct MsprofGeProfIdMapData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_ID_MAP; | |||
| uint32_t graphId; | |||
| uint32_t modelId; | |||
| uint32_t sessionId; | |||
| uint64_t timeStamp; | |||
| uint16_t mode; | |||
| uint8_t reserve[MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES]; | |||
| }; | |||
| #define MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES 24 | |||
| struct MsprofGeProfHostSchData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_HOST_SCH; | |||
| uint32_t threadId; // record in start event | |||
| uint64_t element; | |||
| uint64_t event; | |||
| uint64_t startTime; // record in start event | |||
| uint64_t endTime; // record in end event | |||
| uint8_t reserve[MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES]; | |||
| }; | |||
| /** | |||
| * @brief struct of data reported by RunTime | |||
| */ | |||
| #define MSPROF_RUNTIME_API_DATA_RESERVE_BYTES 106 | |||
| #define MSPROF_RUNTIME_TASK_ID_NUM 10 | |||
| #define MSPROF_RUNTIME_API_NAME_LEN 64 | |||
| struct MsprofRuntimeProfApiData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_API; | |||
| uint32_t threadId; | |||
| uint64_t entryTime; | |||
| uint64_t exitTime; | |||
| uint64_t dataSize; | |||
| uint8_t apiName[MSPROF_RUNTIME_API_NAME_LEN]; | |||
| uint32_t retCode; | |||
| uint32_t streamId; | |||
| uint32_t taskNum; | |||
| uint32_t taskId[MSPROF_RUNTIME_TASK_ID_NUM]; | |||
| uint16_t memcpyDirection; | |||
| uint8_t reserve[MSPROF_RUNTIME_API_DATA_RESERVE_BYTES]; | |||
| }; | |||
| #define MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES 10 | |||
| #define MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN 32 | |||
| struct MsprofRuntimeProfTrackData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_TRACK; | |||
| uint32_t threadId; | |||
| uint64_t timeStamp; | |||
| char taskType[MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN]; | |||
| uint32_t taskId; | |||
| uint16_t streamId; | |||
| uint8_t reserve[MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES]; | |||
| }; | |||
| /** | |||
| * @brief struct of data reported by AICPU | |||
| */ | |||
| #define MSPROF_AICPU_DATA_RESERVE_BYTES 9 | |||
| struct MsprofAicpuProfData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_AICPU_DATA_TAG; | |||
| uint16_t streamId; | |||
| uint16_t taskId; | |||
| uint64_t runStartTime; | |||
| uint64_t runStartTick; | |||
| uint64_t computeStartTime; | |||
| uint64_t memcpyStartTime; | |||
| uint64_t memcpyEndTime; | |||
| uint64_t runEndTime; | |||
| uint64_t runEndTick; | |||
| uint32_t threadId; | |||
| uint32_t deviceId; | |||
| uint64_t submitTick; | |||
| uint64_t scheduleTick; | |||
| uint64_t tickBeforeRun; | |||
| uint64_t tickAfterRun; | |||
| uint32_t kernelType; | |||
| uint32_t dispatchTime; | |||
| uint32_t totalTime; | |||
| uint16_t fftsThreadId; | |||
| uint8_t version; | |||
| uint8_t reserve[MSPROF_AICPU_DATA_RESERVE_BYTES]; | |||
| }; | |||
| /** | |||
| * @brief struct of data reported by DP | |||
| */ | |||
| #define MSPROF_DP_DATA_RESERVE_BYTES 16 | |||
| #define MSPROF_DP_DATA_ACTION_LEN 16 | |||
| #define MSPROF_DP_DATA_SOURCE_LEN 64 | |||
| struct MsprofDpProfData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_DP_DATA_TAG; | |||
| uint32_t rsv; // Ensure 8-byte alignment | |||
| uint64_t timeStamp; | |||
| char action[MSPROF_DP_DATA_ACTION_LEN]; | |||
| char source[MSPROF_DP_DATA_SOURCE_LEN]; | |||
| uint64_t index; | |||
| uint64_t size; | |||
| uint8_t reserve[MSPROF_DP_DATA_RESERVE_BYTES]; | |||
| }; | |||
| /** | |||
| * @brief struct of data reported by HCCL | |||
| */ | |||
| #pragma pack(4) | |||
| struct MsprofHcclProfNotify { | |||
| uint32_t taskID; | |||
| uint64_t notifyID; | |||
| uint32_t stage; | |||
| uint32_t remoteRank; | |||
| uint32_t transportType; | |||
| uint32_t role; // role {0: dst, 1:src} | |||
| double durationEstimated; | |||
| }; | |||
| struct MsprofHcclProfReduce { | |||
| uint32_t taskID; | |||
| uint64_t src; | |||
| uint64_t dst; | |||
| uint64_t size; | |||
| uint32_t op; // {0: sum, 1: mul, 2: max, 3: min} | |||
| uint32_t dataType; // data type {0: INT8, 1: INT16, 2: INT32, 3: FP16, 4:FP32, 5:INT64, 6:UINT64} | |||
| uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'} | |||
| uint32_t remoteRank; | |||
| uint32_t transportType; // transport type {0: SDMA, 1: RDMA, 2:LOCAL}; NOTE(review): MsprofHcclProfRDMA and MsprofHcclProfMemcpy document {0: RDMA, 1:SDMA, 2:LOCAL} - confirm which ordering is correct | |||
| uint32_t role; // role {0: dst, 1:src} | |||
| double durationEstimated; | |||
| }; | |||
| struct MsprofHcclProfRDMA { | |||
| uint32_t taskID; | |||
| uint64_t src; | |||
| uint64_t dst; | |||
| uint64_t size; | |||
| uint64_t notifyID; | |||
| uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'} | |||
| uint32_t remoteRank; | |||
| uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL} | |||
| uint32_t role; // role {0: dst, 1:src} | |||
| uint32_t type; // RDMA type {0: RDMASendNotify, 1:RDMASendPayload} | |||
| double durationEstimated; | |||
| }; | |||
| struct MsprofHcclProfMemcpy { | |||
| uint32_t taskID; | |||
| uint64_t src; | |||
| uint64_t dst; | |||
| uint64_t size; | |||
| uint64_t notifyID; | |||
| uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'} | |||
| uint32_t remoteRank; | |||
| uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL} | |||
| uint32_t role; // role {0: dst, 1:src} | |||
| double durationEstimated; | |||
| }; | |||
| struct MsprofHcclProfStageStep { | |||
| uint32_t rank; | |||
| uint32_t rankSize; | |||
| }; | |||
| struct MsprofHcclProfFlag { | |||
| uint64_t cclTag; | |||
| uint64_t groupName; | |||
| uint32_t localRank; | |||
| uint32_t workFlowMode; | |||
| }; | |||
| /** | |||
| * @name MsprofHcclProfData | |||
| * @brief struct of data reported by hccl | |||
| */ | |||
| struct MsprofHcclProfData { | |||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||
| uint16_t dataTag = MSPROF_HCCL_DATA_TAG; | |||
| uint32_t planeID; | |||
| uint32_t deviceID; | |||
| uint32_t streamID; | |||
| double ts; | |||
| char name[16]; | |||
| union { | |||
| MsprofHcclProfNotify notify; | |||
| MsprofHcclProfReduce reduce; | |||
| MsprofHcclProfStageStep stageStep; | |||
| MsprofHcclProfMemcpy forMemcpy; | |||
| MsprofHcclProfRDMA RDMA; | |||
| MsprofHcclProfFlag flag; | |||
| } args; | |||
| }; | |||
| #pragma pack() | |||
| /** | |||
| * @name MsprofStampInfo | |||
| * @brief struct of data reported by msproftx | |||
| */ | |||
| struct MsprofStampInfo { | |||
| uint16_t magicNumber; | |||
| uint16_t dataTag; | |||
| uint32_t processId; | |||
| uint32_t threadId; | |||
| uint32_t category; //marker category | |||
| uint32_t eventType; | |||
| int32_t payloadType; | |||
| union PayloadValue //payload info for marker | |||
| { | |||
| uint64_t ullValue; | |||
| int64_t llValue; | |||
| double dValue; | |||
| uint32_t uiValue[2]; | |||
| int32_t iValue[2]; | |||
| float fValue[2]; | |||
| } payload; | |||
| uint64_t startTime; | |||
| uint64_t endTime; | |||
| int32_t messageType; | |||
| char message[128]; | |||
| uint8_t reserve0[4]; | |||
| uint8_t reserve1[72]; | |||
| }; | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| #endif // MSPROFILER_PROF_COMMON_H_ | |||