| @@ -22,6 +22,8 @@ | |||||
| #include <mutex> | #include <mutex> | ||||
| #include <unordered_map> | #include <unordered_map> | ||||
| #include "graph/profiler.h" | #include "graph/profiler.h" | ||||
| #include "external/ge/ge_api_types.h" | |||||
| #include "toolchain/prof_callback.h" | |||||
| namespace ge { | namespace ge { | ||||
| namespace profiling { | namespace profiling { | ||||
| enum { | enum { | ||||
| @@ -46,6 +48,7 @@ enum { | |||||
| kCopyH2D, | kCopyH2D, | ||||
| kProfilingIndexEnd | kProfilingIndexEnd | ||||
| }; | }; | ||||
| constexpr uint64_t kInvalidHashId = 0ULL; | |||||
| class ProfilingContext { | class ProfilingContext { | ||||
| public: | public: | ||||
| @@ -100,9 +103,16 @@ class ProfilingContext { | |||||
| } | } | ||||
| int64_t RegisterString(const std::string &str); | int64_t RegisterString(const std::string &str); | ||||
| int64_t RegisterStringHash(const uint64_t hash_id, const std::string &str); | |||||
| void UpdateElementHashId(MsprofReporterCallback reporter_callback); | |||||
| static Status QueryHashId(const MsprofReporterCallback reporter_callback, const std::string &src_str, | |||||
| uint64_t &hash_id); | |||||
| size_t GetRegisterStringNum() const { | |||||
| return strings_to_index_.size(); | |||||
| } | |||||
| private: | private: | ||||
| void RegisterString(int64_t index, const std::string &str); | |||||
| void UpdateHashByStr(const std::string &str, const uint64_t hash); | |||||
| void Init(); | void Init(); | ||||
| private: | private: | ||||
| @@ -0,0 +1,35 @@ | |||||
| /** | |||||
| * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef INC_FRAMEWORK_OMG_MODEL_TOOL_H_ | |||||
| #define INC_FRAMEWORK_OMG_MODEL_TOOL_H_ | |||||
| #include <memory> | |||||
| #include <string> | |||||
| #include "framework/common/debug/ge_log.h" | |||||
| #include "proto/ge_ir.pb.h" | |||||
| namespace ge { | |||||
| class GE_FUNC_VISIBILITY ModelTool { | |||||
| public: | |||||
| static Status GetModelInfoFromOm(const char *model_file, ge::proto::ModelDef &model_def, uint32_t &modeldef_size); | |||||
| static Status GetModelInfoFromPbtxt(const char *model_file, ge::proto::ModelDef &model_def); | |||||
| }; | |||||
| } // namespace ge | |||||
| #endif // INC_FRAMEWORK_OMG_MODEL_TOOL_H_ | |||||
| @@ -1 +1 @@ | |||||
| Subproject commit 2659f49dcb14c0773e10e17ee9896b7be4d8e7be | |||||
| Subproject commit dc5ac26aac4c49b4e72cd91d4e6d6a57bbe03af4 | |||||
| @@ -145,9 +145,9 @@ struct ResultSummary { | |||||
| #pragma pack(push, 1) | #pragma pack(push, 1) | ||||
| struct AsyncWait { | struct AsyncWait { | ||||
| uint8_t waitType; // wait type, FWk_ADPT_WAIT_TPYE_EVENT: event wait | |||||
| uint32_t waitId; // wait id, GE refresh | |||||
| uint32_t timeOut; // reserved | |||||
| uint8_t waitType; // wait type, FWK_ADPT_WAIT_TYPE_EVENT: event wait | |||||
| uint32_t waitId; // wait id, GE refresh | |||||
| uint32_t timeOut; // reserved | |||||
| uint64_t reserved; | uint64_t reserved; | ||||
| }; | }; | ||||
| #pragma pack(pop) | #pragma pack(pop) | ||||
| @@ -79,9 +79,6 @@ typedef long LONG; | |||||
| #define MMPA_THREAD_SCHED_OTHER SCHED_OTHER | #define MMPA_THREAD_SCHED_OTHER SCHED_OTHER | ||||
| #define MMPA_THREAD_MIN_STACK_SIZE PTHREAD_STACK_MIN | #define MMPA_THREAD_MIN_STACK_SIZE PTHREAD_STACK_MIN | ||||
| #define MMPA_PATH_SEPARATOR_STR "/" | |||||
| #define MMPA_PATH_SEPARATOR_CHAR '/' | |||||
| #define MM_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER | #define MM_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER | ||||
| #define MMPA_MAX_NI 19 | #define MMPA_MAX_NI 19 | ||||
| @@ -1,86 +1,83 @@ | |||||
| /** | |||||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MMPA_TYPEDEF_WIN_H | |||||
| #define MMPA_TYPEDEF_WIN_H | |||||
| #ifdef __cplusplus | |||||
| #if __cplusplus | |||||
| extern "C" { | |||||
| #endif // __cpluscplus | |||||
| #endif // __cpluscplus | |||||
| #ifndef FALSE | |||||
| #define FALSE 0 | |||||
| #endif | |||||
| #ifndef TRUE | |||||
| #define TRUE 1 | |||||
| #endif | |||||
| #define EN_OK 0 | |||||
| #define EN_ERR 1 | |||||
| #define EN_ERROR (-1) | |||||
| #define EN_INVALID_PARAM (-2) | |||||
| #define EN_TIMEOUT (-3) | |||||
| #define HANDLE_INVALID_VALUE (-1) | |||||
| #define INVALID_SOCKET_HANDLE INVALID_SOCKET | |||||
| #define MMPA_MEM_MAX_LEN (0x7fffffff) | |||||
| #define MMPA_PROCESS_ERROR (0x7fffffff) | |||||
| #define MMPA_ONE_THOUSAND 1000 | |||||
| #define MMPA_COMPUTER_BEGIN_YEAR 1900 | |||||
| #define SUMMER_TIME_OR_NOT (-1) | |||||
| #define MMPA_ZERO 0 | |||||
| #define MMPA_VALUE_ONE 1 | |||||
| #define MMPA_SOCKET_MAIN_EDITION 2 | |||||
| #define MMPA_SOCKET_SECOND_EDITION 0 | |||||
| #define MMPA_PIPE_BUF_SIZE 1024 | |||||
| #define MMPA_MAX_SCANDIR_COUNT 1024 | |||||
| #define MAX_IOVEC_SIZE 32 | |||||
| #define MMPA_PIPE_COUNT 2 | |||||
| #define MMPA_THREADNAME_SIZE 16 | |||||
| #define MMPA_MIN_OS_NAME_SIZE (MAX_COMPUTERNAME_LENGTH + 1) | |||||
| #define MMPA_MIN_OS_VERSION_SIZE 64 | |||||
| #define MMPA_MAX_NI 19 | |||||
| #define MMPA_MIDDLE_NI 5 | |||||
| #define MMPA_LOW_NI (-5) | |||||
| #define MMPA_MIN_NI (-20) | |||||
| #define MMPA_MAX_FILE 128 | |||||
| #define MMPA_PATH_SEPARATOR_STR "\\" | |||||
| #define MMPA_PATH_SEPARATOR_CHAR '\\' | |||||
| #define MMPA_MAX_THREAD_PIO 99 | |||||
| #define MMPA_MIDDLE_THREAD_PIO 66 | |||||
| #define MMPA_LOW_THREAD_PIO 33 | |||||
| #define MMPA_MIN_THREAD_PIO 1 | |||||
| #define MMPA_THREAD_SCHED_RR 0 | |||||
| #define MMPA_THREAD_SCHED_FIFO 0 | |||||
| #define MMPA_THREAD_SCHED_OTHER 0 | |||||
| #define MMPA_THREAD_MIN_STACK_SIZE 0 | |||||
| #define MM_MUTEX_INITIALIZER NULL | |||||
| #ifdef __cplusplus | |||||
| #if __cplusplus | |||||
| } | |||||
| #endif // __cpluscplus | |||||
| #endif // __cpluscplus | |||||
| #endif // _MMPA_TYPEDEF_WIN_H_ | |||||
| /** | |||||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MMPA_TYPEDEF_WIN_H | |||||
| #define MMPA_TYPEDEF_WIN_H | |||||
| #ifdef __cplusplus | |||||
| #if __cplusplus | |||||
| extern "C" { | |||||
| #endif // __cplusplus | |||||
| #endif // __cplusplus | |||||
| #ifndef FALSE | |||||
| #define FALSE 0 | |||||
| #endif | |||||
| #ifndef TRUE | |||||
| #define TRUE 1 | |||||
| #endif | |||||
| #define EN_OK 0 | |||||
| #define EN_ERR 1 | |||||
| #define EN_ERROR (-1) | |||||
| #define EN_INVALID_PARAM (-2) | |||||
| #define EN_TIMEOUT (-3) | |||||
| #define HANDLE_INVALID_VALUE (-1) | |||||
| #define INVALID_SOCKET_HANDLE INVALID_SOCKET | |||||
| #define MMPA_MEM_MAX_LEN (0x7fffffff) | |||||
| #define MMPA_PROCESS_ERROR (0x7fffffff) | |||||
| #define MMPA_ONE_THOUSAND 1000 | |||||
| #define MMPA_COMPUTER_BEGIN_YEAR 1900 | |||||
| #define SUMMER_TIME_OR_NOT (-1) | |||||
| #define MMPA_ZERO 0 | |||||
| #define MMPA_VALUE_ONE 1 | |||||
| #define MMPA_SOCKET_MAIN_EDITION 2 | |||||
| #define MMPA_SOCKET_SECOND_EDITION 0 | |||||
| #define MMPA_PIPE_BUF_SIZE 1024 | |||||
| #define MMPA_MAX_SCANDIR_COUNT 1024 | |||||
| #define MAX_IOVEC_SIZE 32 | |||||
| #define MMPA_PIPE_COUNT 2 | |||||
| #define MMPA_THREADNAME_SIZE 16 | |||||
| #define MMPA_MIN_OS_NAME_SIZE (MAX_COMPUTERNAME_LENGTH + 1) | |||||
| #define MMPA_MIN_OS_VERSION_SIZE 64 | |||||
| #define MMPA_MAX_NI 19 | |||||
| #define MMPA_MIDDLE_NI 5 | |||||
| #define MMPA_LOW_NI (-5) | |||||
| #define MMPA_MIN_NI (-20) | |||||
| #define MMPA_MAX_FILE 128 | |||||
| #define MMPA_MAX_THREAD_PIO 99 | |||||
| #define MMPA_MIDDLE_THREAD_PIO 66 | |||||
| #define MMPA_LOW_THREAD_PIO 33 | |||||
| #define MMPA_MIN_THREAD_PIO 1 | |||||
| #define MMPA_THREAD_SCHED_RR 0 | |||||
| #define MMPA_THREAD_SCHED_FIFO 0 | |||||
| #define MMPA_THREAD_SCHED_OTHER 0 | |||||
| #define MMPA_THREAD_MIN_STACK_SIZE 0 | |||||
| #define MM_MUTEX_INITIALIZER NULL | |||||
| #ifdef __cplusplus | |||||
| #if __cplusplus | |||||
| } | |||||
| #endif // __cplusplus | |||||
| #endif // __cplusplus | |||||
| #endif // MMPA_TYPEDEF_WIN_H | |||||
| @@ -142,6 +142,74 @@ REG_OP(BatchNorm) | |||||
| .ATTR(is_training, Bool, true) | .ATTR(is_training, Bool, true) | ||||
| .OP_END_FACTORY_REG(BatchNorm) | .OP_END_FACTORY_REG(BatchNorm) | ||||
| /** | |||||
| * @brief After the mean and reciprocal of standard deviation(invert_std) are separately calculated on each device, | |||||
| * the mean and reciprocal of standard deviation(invert_std) data on each device are normalized, | |||||
| * a total mean and reciprocal of standard deviation(invert_std) are returned, and running_var is updated. | |||||
| * @par Inputs: | |||||
| * include: | |||||
| * @li mean_all: A Tensor. The mean of each device. Must be one of the following types: float16, float32. | |||||
| * @li invert_std_all: A Tensor. Reciprocal of the variances of each device. Must be one of the following types: float16, float32. | |||||
| * @li count_all: A Tensor. Number of data for each device. Must be one of the following types: float16, float32. | |||||
| * @li mean_broadcast: A Tensor. The overall average and broadcast. Must be one of the following types: float16, float32. | |||||
| * @li count_sum: A Tensor. General statistics. Must be one of the following types: float16, float32. | |||||
| * @li running_var: A Tensor. Runtime variance. Must be one of the following types: float16, float32. \n | |||||
| * @par Attributes: | |||||
| * Two Attributes, including: | |||||
| * @li momentum: An optional float. Defaults to 0.1 (the value registered below). \n | |||||
| * @li epsilon: An optional float. Defaults to 0.001 (the value registered below; NOTE(review): this text previously said 0.00001 - confirm the intended default). \n | |||||
| * @par Outputs: | |||||
| * include: | |||||
| * @li invert_std: A Tensor. It's inverse of total variance. | |||||
| * @li running_var_update: A Tensor. It's moving variance of each device after the update. \n | |||||
| * @par Third-party framework compatibility | |||||
| * ReduceMeanWithCount and SyncBatchNormGatherStatsWithCounts and SyncBNTrainingUpdate | |||||
| * compatible with the Pytorch operator BatchNormGatherStatsWithCounts. | |||||
| */ | |||||
| REG_OP(SyncBatchNormGatherStatsWithCounts) | |||||
| .INPUT(mean_all, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .INPUT(invert_std_all, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .INPUT(count_all, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .INPUT(mean_broadcast, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .INPUT(count_sum, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .INPUT(running_var, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .OUTPUT(invert_std, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .OUTPUT(running_var_update, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .ATTR(momentum, Float, 0.1) | |||||
| .ATTR(epsilon, Float, 0.001) | |||||
| .OP_END_FACTORY_REG(SyncBatchNormGatherStatsWithCounts) | |||||
| /** | |||||
| * @brief update running_mean. | |||||
| * @par Inputs: | |||||
| * include: | |||||
| * @li mean: A Tensor. The mean of each device. Must be one of the following types: float16, float32. | |||||
| * @li running_mean: A Tensor. Runtime Mean. Must be one of the following types: float16, float32. \n | |||||
| * @par Attributes: | |||||
| * One Attribute, including: | |||||
| * @li momentum: An optional float. Defaults to 0.1 (the value registered below). \n | |||||
| * @par Outputs: | |||||
| * include: | |||||
| * @li running_mean_update: A Tensor. It's moving mean of each device after the update. \n | |||||
| * @par Third-party framework compatibility | |||||
| * ReduceMeanWithCount and SyncBatchNormGatherStatsWithCounts and SyncBNTrainingUpdate | |||||
| * compatible with the Pytorch operator BatchNormGatherStatsWithCounts. | |||||
| */ | |||||
| REG_OP(SyncBNTrainingUpdate) | |||||
| .INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .INPUT(running_mean, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .OUTPUT(running_mean_update, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .ATTR(momentum, Float, 0.1) | |||||
| .OP_END_FACTORY_REG(SyncBNTrainingUpdate) | |||||
| /** | /** | ||||
| *@brief part of SyncBatchNormBackward . \n | *@brief part of SyncBatchNormBackward . \n | ||||
| @@ -515,6 +515,34 @@ REG_OP(ReduceSumD) | |||||
| .ATTR(keep_dims, Bool, false) | .ATTR(keep_dims, Bool, false) | ||||
| .OP_END_FACTORY_REG(ReduceSumD) | .OP_END_FACTORY_REG(ReduceSumD) | ||||
| /** | |||||
| *@brief Calculate the total mean based on the mean of each device . \n | |||||
| *@par Inputs: | |||||
| * Three inputs, including: | |||||
| *@li x: A Tensor. Must be one of the following types: float16, float32 . | |||||
| *@li count: A Tensor. Must be one of the following types: float16, float32 . | |||||
| *@li count_sum: A Tensor. Must be one of the following types: float16, float32 . \n | |||||
| *@par Attributes: | |||||
| *@li axes: A required 1D list or tuple of int32 or int64. Specifies the dimensions to reduce. | |||||
| *@li keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n | |||||
| *@par Outputs: | |||||
| *y: The reduced tensor. Has the same type and format as input "x" . \n | |||||
| *@par Third-party framework compatibility | |||||
| * Compatible with the TensorFlow operator Sum. | |||||
| */ | |||||
| REG_OP(ReduceMeanWithCount) | |||||
| .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .INPUT(count, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .INPUT(count_sum, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) | |||||
| .REQUIRED_ATTR(axes, ListInt) | |||||
| .ATTR(keep_dims, Bool, false) | |||||
| .OP_END_FACTORY_REG(ReduceMeanWithCount) | |||||
| /** | /** | ||||
| *@brief Calculates the "logical sum" of elements of a tensor in a dimension . \n | *@brief Calculates the "logical sum" of elements of a tensor in a dimension . \n | ||||
| @@ -1363,6 +1391,64 @@ REG_OP(ReduceStdV2Update) | |||||
| .ATTR(unbiased, Bool, true) | .ATTR(unbiased, Bool, true) | ||||
| .ATTR(keepdim, Bool, false) | .ATTR(keepdim, Bool, false) | ||||
| .OP_END_FACTORY_REG(ReduceStdV2Update) | .OP_END_FACTORY_REG(ReduceStdV2Update) | ||||
| /** | |||||
| *@brief Computes the log of the sum of the exponentials of elements, i.e. log(sum(exp(x))), across dimensions of a tensor. | |||||
| * Reduces "x" along the dimensions given in "axes". | |||||
| * Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each | |||||
| * entry in "axes". If "keep_dims" is true, the reduced dimensions | |||||
| * are retained with length 1. | |||||
| * | |||||
| *@par Inputs: | |||||
| * Two inputs, including: | |||||
| *@li x: A Tensor. Must be one of the following types: | |||||
| * float32, float16, int32, int64, uint32, uint64, double | |||||
| *@li axes: A 1D list or tuple of int32 or int64. Specifies the dimensions to reduce . \n | |||||
| * | |||||
| *@par Attributes: | |||||
| *keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n | |||||
| * | |||||
| *@par Outputs: | |||||
| *y: The reduced tensor. Has the same type and format as input "x" . \n | |||||
| * | |||||
| *@par Third-party framework compatibility | |||||
| * Compatible with the Onnx operator ReduceLogSumExp. | |||||
| */ | |||||
| REG_OP(ReduceLogSumExp) | |||||
| .INPUT(x, TensorType::NumberType()) | |||||
| .INPUT(axes, TensorType::IndexNumberType()) | |||||
| .OUTPUT(y, TensorType::NumberType()) | |||||
| .ATTR(keep_dims, Bool, false) | |||||
| .OP_END_FACTORY_REG(ReduceLogSumExp) | |||||
| /** | |||||
| *@brief Computes the log of the sum of elements, i.e. log(sum(x)), across dimensions of a tensor. | |||||
| * Reduces "x" along the dimensions given in "axes". | |||||
| * Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each | |||||
| * entry in "axes". If "keep_dims" is true, the reduced dimensions | |||||
| * are retained with length 1. | |||||
| * | |||||
| *@par Inputs: | |||||
| * Two inputs, including: | |||||
| *@li x: A Tensor. Must be one of the following types: | |||||
| * float32, float16, int32, int64, uint32, uint64, double | |||||
| *@li axes: A 1D list or tuple of int32 or int64. Specifies the dimensions to reduce . \n | |||||
| * | |||||
| *@par Attributes: | |||||
| *keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n | |||||
| * | |||||
| *@par Outputs: | |||||
| *y: The reduced tensor. Has the same type and format as input "x" . \n | |||||
| * | |||||
| *@par Third-party framework compatibility | |||||
| * Compatible with the Onnx operator ReduceLogSum. | |||||
| */ | |||||
| REG_OP(ReduceLogSum) | |||||
| .INPUT(x, TensorType::NumberType()) | |||||
| .INPUT(axes, TensorType::IndexNumberType()) | |||||
| .OUTPUT(y, TensorType::NumberType()) | |||||
| .ATTR(keep_dims, Bool, false) | |||||
| .OP_END_FACTORY_REG(ReduceLogSum) | |||||
| } //namespace ge | } //namespace ge | ||||
| #endif // OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_ | #endif // OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_ | ||||
| @@ -1,8 +1,17 @@ | |||||
| /* | |||||
| * Copyright (c) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved. | |||||
| * Description: handle perf data | |||||
| * Author: xp | |||||
| * Create: 2019-10-13 | |||||
| /** | |||||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | */ | ||||
| #ifndef MSPROFILER_PROF_CALLBACK_H_ | #ifndef MSPROFILER_PROF_CALLBACK_H_ | ||||
| @@ -0,0 +1,450 @@ | |||||
| /* | |||||
| * Copyright (c) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved. | |||||
| * Description: handle perf data | |||||
| * Author: Huawei Technologies Co., Ltd. | |||||
| * Create: 2019-10-13 | |||||
| */ | |||||
| #ifndef MSPROFILER_PROF_COMMON_H_ | |||||
| #define MSPROFILER_PROF_COMMON_H_ | |||||
| #ifdef __cplusplus | |||||
| extern "C" { | |||||
| #endif // __cplusplus | |||||
| #include <stdint.h> | |||||
| #define MSPROF_DATA_HEAD_MAGIC_NUM 0x5a5a | |||||
| enum MsprofDataTag { | |||||
| MSPROF_ACL_DATA_TAG = 0, // acl data tag, range: 0~19 | |||||
| MSPROF_GE_DATA_TAG_MODEL_LOAD = 20, // ge data tag, range: 20~39 | |||||
| MSPROF_GE_DATA_TAG_FUSION = 21, | |||||
| MSPROF_GE_DATA_TAG_INFER = 22, | |||||
| MSPROF_GE_DATA_TAG_TASK = 23, | |||||
| MSPROF_GE_DATA_TAG_TENSOR = 24, | |||||
| MSPROF_GE_DATA_TAG_STEP = 25, | |||||
| MSPROF_GE_DATA_TAG_ID_MAP = 26, | |||||
| MSPROF_GE_DATA_TAG_HOST_SCH = 27, | |||||
| MSPROF_RUNTIME_DATA_TAG_API = 40, // runtime data tag, range: 40~59 | |||||
| MSPROF_RUNTIME_DATA_TAG_TRACK = 41, | |||||
| MSPROF_AICPU_DATA_TAG = 60, // aicpu data tag, range: 60~79 | |||||
| MSPROF_HCCL_DATA_TAG = 80, // hccl data tag, range: 80~99 | |||||
| MSPROF_DP_DATA_TAG = 100, // dp data tag, range: 100~119 | |||||
| MSPROF_MSPROFTX_DATA_TAG = 120, // msproftx data tag, range: 120~139 | |||||
| MSPROF_DATA_TAG_MAX = 65536, // exclusive upper bound: data tag values are stored as uint16_t, which cannot hold 65536 | |||||
| }; | |||||
| /** | |||||
| * @brief struct of mixed data | |||||
| */ | |||||
| #define MSPROF_MIX_DATA_RESERVE_BYTES 7 | |||||
| #define MSPROF_MIX_DATA_STRING_LEN 120 | |||||
| enum MsprofMixDataType { | |||||
| MSPROF_MIX_DATA_HASH_ID = 0, | |||||
| MSPROF_MIX_DATA_STRING, | |||||
| }; | |||||
| struct MsprofMixData { | |||||
| uint8_t type; // MsprofMixDataType | |||||
| uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES]; | |||||
| union { | |||||
| uint64_t hashId; | |||||
| char dataStr[MSPROF_MIX_DATA_STRING_LEN]; | |||||
| } data; | |||||
| }; | |||||
| using MixData = struct MsprofMixData; | |||||
| /** | |||||
| * @brief profiling command info | |||||
| */ | |||||
| #define MSPROF_MAX_DEV_NUM 64 | |||||
| struct MsprofCommandHandle { | |||||
| uint64_t profSwitch; | |||||
| uint64_t profSwitchHi; | |||||
| uint32_t devNums; | |||||
| uint32_t devIdList[MSPROF_MAX_DEV_NUM]; | |||||
| uint32_t modelId; | |||||
| uint32_t type; | |||||
| }; | |||||
| /** | |||||
| * @brief struct of data reported by acl | |||||
| */ | |||||
| #define MSPROF_ACL_DATA_RESERVE_BYTES 32 | |||||
| #define MSPROF_ACL_API_NAME_LEN 64 | |||||
| enum MsprofAclApiType { | |||||
| MSPROF_ACL_API_TYPE_OP = 1, | |||||
| MSPROF_ACL_API_TYPE_MODEL, | |||||
| MSPROF_ACL_API_TYPE_RUNTIME, | |||||
| MSPROF_ACL_API_TYPE_OTHERS, | |||||
| }; | |||||
| struct MsprofAclProfData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_ACL_DATA_TAG; | |||||
| uint32_t apiType; // enum MsprofAclApiType | |||||
| uint64_t beginTime; | |||||
| uint64_t endTime; | |||||
| uint32_t processId; | |||||
| uint32_t threadId; | |||||
| char apiName[MSPROF_ACL_API_NAME_LEN]; | |||||
| uint8_t reserve[MSPROF_ACL_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| /** | |||||
| * @brief struct of data reported by GE | |||||
| */ | |||||
| #define MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES 104 | |||||
| struct MsprofGeProfModelLoadData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_MODEL_LOAD; | |||||
| uint32_t modelId; | |||||
| MixData modelName; | |||||
| uint64_t startTime; | |||||
| uint64_t endTime; | |||||
| uint8_t reserve[MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| #define MSPROF_GE_FUSION_DATA_RESERVE_BYTES 8 | |||||
| #define MSPROF_GE_FUSION_OP_NUM 8 | |||||
| struct MsprofGeProfFusionData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_FUSION; | |||||
| uint32_t modelId; | |||||
| MixData fusionName; | |||||
| uint64_t inputMemSize; | |||||
| uint64_t outputMemSize; | |||||
| uint64_t weightMemSize; | |||||
| uint64_t workspaceMemSize; | |||||
| uint64_t totalMemSize; | |||||
| uint64_t fusionOpNum; | |||||
| uint64_t fusionOp[MSPROF_GE_FUSION_OP_NUM]; | |||||
| uint8_t reserve[MSPROF_GE_FUSION_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| #define MSPROF_GE_INFER_DATA_RESERVE_BYTES 64 | |||||
| struct MsprofGeProfInferData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_INFER; | |||||
| uint32_t modelId; | |||||
| MixData modelName; | |||||
| uint32_t requestId; | |||||
| uint32_t threadId; | |||||
| uint64_t inputDataStartTime; | |||||
| uint64_t inputDataEndTime; | |||||
| uint64_t inferStartTime; | |||||
| uint64_t inferEndTime; | |||||
| uint64_t outputDataStartTime; | |||||
| uint64_t outputDataEndTime; | |||||
| uint8_t reserve[MSPROF_GE_INFER_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| #define MSPROF_GE_TASK_DATA_RESERVE_BYTES 16 | |||||
| #define MSPROF_GE_OP_TYPE_LEN 56 | |||||
| enum MsprofGeTaskType { | |||||
| MSPROF_GE_TASK_TYPE_AI_CORE = 0, | |||||
| MSPROF_GE_TASK_TYPE_AI_CPU, | |||||
| MSPROF_GE_TASK_TYPE_AIV, | |||||
| }; | |||||
| enum MsprofGeShapeType { | |||||
| MSPROF_GE_SHAPE_TYPE_STATIC = 0, | |||||
| MSPROF_GE_SHAPE_TYPE_DYNAMIC, | |||||
| }; | |||||
| struct MsprofGeOpType { | |||||
| uint8_t type; // MsprofMixDataType | |||||
| uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES]; | |||||
| union { | |||||
| uint64_t hashId; | |||||
| char dataStr[MSPROF_GE_OP_TYPE_LEN]; | |||||
| } data; | |||||
| }; | |||||
| struct MsprofGeProfTaskData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_TASK; | |||||
| uint32_t taskType; // MsprofGeTaskType | |||||
| MixData opName; | |||||
| MsprofGeOpType opType; | |||||
| uint64_t curIterNum; | |||||
| uint64_t timeStamp; | |||||
| uint32_t shapeType; // MsprofGeShapeType | |||||
| uint32_t blockDims; | |||||
| uint32_t modelId; | |||||
| uint32_t streamId; | |||||
| uint32_t taskId; | |||||
| uint32_t threadId; | |||||
| uint8_t reserve[MSPROF_GE_TASK_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| #define MSPROF_GE_TENSOR_DATA_RESERVE_BYTES 8 | |||||
| #define MSPROF_GE_TENSOR_DATA_SHAPE_LEN 8 | |||||
| #define MSPROF_GE_TENSOR_DATA_NUM 5 | |||||
| enum MsprofGeTensorType { | |||||
| MSPROF_GE_TENSOR_TYPE_INPUT = 0, | |||||
| MSPROF_GE_TENSOR_TYPE_OUTPUT, | |||||
| }; | |||||
| struct MsprofGeTensorData { | |||||
| uint32_t tensorType; // MsprofGeTensorType | |||||
| uint32_t format; | |||||
| uint32_t dataType; | |||||
| uint32_t shape[MSPROF_GE_TENSOR_DATA_SHAPE_LEN]; | |||||
| }; | |||||
| struct MsprofGeProfTensorData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_TENSOR; | |||||
| uint32_t modelId; | |||||
| uint64_t curIterNum; | |||||
| uint32_t streamId; | |||||
| uint32_t taskId; | |||||
| uint32_t tensorNum; | |||||
| MsprofGeTensorData tensorData[MSPROF_GE_TENSOR_DATA_NUM]; | |||||
| uint8_t reserve[MSPROF_GE_TENSOR_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| #define MSPROF_GE_STEP_DATA_RESERVE_BYTES 27 | |||||
| enum MsprofGeStepTag { | |||||
| MSPROF_GE_STEP_TAG_BEGIN = 0, | |||||
| MSPROF_GE_STEP_TAG_END, | |||||
| }; | |||||
| struct MsprofGeProfStepData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_STEP; | |||||
| uint32_t modelId; | |||||
| uint32_t streamId; | |||||
| uint32_t taskId; | |||||
| uint64_t timeStamp; | |||||
| uint64_t curIterNum; | |||||
| uint32_t threadId; | |||||
| uint8_t tag; // MsprofGeStepTag | |||||
| uint8_t reserve[MSPROF_GE_STEP_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| #define MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES 6 | |||||
| struct MsprofGeProfIdMapData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_ID_MAP; | |||||
| uint32_t graphId; | |||||
| uint32_t modelId; | |||||
| uint32_t sessionId; | |||||
| uint64_t timeStamp; | |||||
| uint16_t mode; | |||||
| uint8_t reserve[MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| #define MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES 24 | |||||
| struct MsprofGeProfHostSchData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_GE_DATA_TAG_HOST_SCH; | |||||
| uint32_t threadId; // record in start event | |||||
| uint64_t element; | |||||
| uint64_t event; | |||||
| uint64_t startTime; // record in start event | |||||
| uint64_t endTime; // record in end event | |||||
| uint8_t reserve[MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| /** | |||||
| * @brief struct of data reported by RunTime | |||||
| */ | |||||
| #define MSPROF_RUNTIME_API_DATA_RESERVE_BYTES 106 | |||||
| #define MSPROF_RUNTIME_TASK_ID_NUM 10 | |||||
| #define MSPROF_RUNTIME_API_NAME_LEN 64 | |||||
| struct MsprofRuntimeProfApiData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_API; | |||||
| uint32_t threadId; | |||||
| uint64_t entryTime; | |||||
| uint64_t exitTime; | |||||
| uint64_t dataSize; | |||||
| uint8_t apiName[MSPROF_RUNTIME_API_NAME_LEN]; | |||||
| uint32_t retCode; | |||||
| uint32_t streamId; | |||||
| uint32_t taskNum; | |||||
| uint32_t taskId[MSPROF_RUNTIME_TASK_ID_NUM]; | |||||
| uint16_t memcpyDirection; | |||||
| uint8_t reserve[MSPROF_RUNTIME_API_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| #define MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES 10 | |||||
| #define MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN 32 | |||||
| struct MsprofRuntimeProfTrackData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_TRACK; | |||||
| uint32_t threadId; | |||||
| uint64_t timeStamp; | |||||
| char taskType[MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN]; | |||||
| uint32_t taskId; | |||||
| uint16_t streamId; | |||||
| uint8_t reserve[MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| /** | |||||
| * @brief struct of data reported by AICPU | |||||
| */ | |||||
| #define MSPROF_AICPU_DATA_RESERVE_BYTES 9 | |||||
| struct MsprofAicpuProfData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_AICPU_DATA_TAG; | |||||
| uint16_t streamId; | |||||
| uint16_t taskId; | |||||
| uint64_t runStartTime; | |||||
| uint64_t runStartTick; | |||||
| uint64_t computeStartTime; | |||||
| uint64_t memcpyStartTime; | |||||
| uint64_t memcpyEndTime; | |||||
| uint64_t runEndTime; | |||||
| uint64_t runEndTick; | |||||
| uint32_t threadId; | |||||
| uint32_t deviceId; | |||||
| uint64_t submitTick; | |||||
| uint64_t scheduleTick; | |||||
| uint64_t tickBeforeRun; | |||||
| uint64_t tickAfterRun; | |||||
| uint32_t kernelType; | |||||
| uint32_t dispatchTime; | |||||
| uint32_t totalTime; | |||||
| uint16_t fftsThreadId; | |||||
| uint8_t version; | |||||
| uint8_t reserve[MSPROF_AICPU_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| /** | |||||
| * @brief struct of data reported by DP | |||||
| */ | |||||
| #define MSPROF_DP_DATA_RESERVE_BYTES 16 | |||||
| #define MSPROF_DP_DATA_ACTION_LEN 16 | |||||
| #define MSPROF_DP_DATA_SOURCE_LEN 64 | |||||
| struct MsprofDpProfData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_DP_DATA_TAG; | |||||
| uint32_t rsv; // Ensure 8-byte alignment | |||||
| uint64_t timeStamp; | |||||
| char action[MSPROF_DP_DATA_ACTION_LEN]; | |||||
| char source[MSPROF_DP_DATA_SOURCE_LEN]; | |||||
| uint64_t index; | |||||
| uint64_t size; | |||||
| uint8_t reserve[MSPROF_DP_DATA_RESERVE_BYTES]; | |||||
| }; | |||||
| /** | |||||
| * @brief struct of data reported by HCCL | |||||
| */ | |||||
| #pragma pack(4) | |||||
| struct MsprofHcclProfNotify { | |||||
| uint32_t taskID; | |||||
| uint64_t notifyID; | |||||
| uint32_t stage; | |||||
| uint32_t remoteRank; | |||||
| uint32_t transportType; | |||||
| uint32_t role; // role {0: dst, 1:src} | |||||
| double durationEstimated; | |||||
| }; | |||||
| struct MsprofHcclProfReduce { | |||||
| uint32_t taskID; | |||||
| uint64_t src; | |||||
| uint64_t dst; | |||||
| uint64_t size; | |||||
| uint32_t op; // {0: sum, 1: mul, 2: max, 3: min} | |||||
| uint32_t dataType; // data type {0: INT8, 1: INT16, 2: INT32, 3: FP16, 4:FP32, 5:INT64, 6:UINT64} | |||||
| uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'} | |||||
| uint32_t remoteRank; | |||||
| uint32_t transportType; // transport type {0: SDMA, 1: RDMA, 2:LOCAL}; NOTE(review): MsprofHcclProfRDMA and MsprofHcclProfMemcpy document {0: RDMA, 1:SDMA, 2:LOCAL} - confirm which mapping is correct | |||||
| uint32_t role; // role {0: dst, 1:src} | |||||
| double durationEstimated; | |||||
| }; | |||||
| struct MsprofHcclProfRDMA { | |||||
| uint32_t taskID; | |||||
| uint64_t src; | |||||
| uint64_t dst; | |||||
| uint64_t size; | |||||
| uint64_t notifyID; | |||||
| uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'} | |||||
| uint32_t remoteRank; | |||||
| uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL} | |||||
| uint32_t role; // role {0: dst, 1:src} | |||||
| uint32_t type; // RDMA type {0: RDMASendNotify, 1:RDMASendPayload} | |||||
| double durationEstimated; | |||||
| }; | |||||
| struct MsprofHcclProfMemcpy { | |||||
| uint32_t taskID; | |||||
| uint64_t src; | |||||
| uint64_t dst; | |||||
| uint64_t size; | |||||
| uint64_t notifyID; | |||||
| uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'} | |||||
| uint32_t remoteRank; | |||||
| uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL} | |||||
| uint32_t role; // role {0: dst, 1:src} | |||||
| double durationEstimated; | |||||
| }; | |||||
| struct MsprofHcclProfStageStep { | |||||
| uint32_t rank; | |||||
| uint32_t rankSize; | |||||
| }; | |||||
| struct MsprofHcclProfFlag { | |||||
| uint64_t cclTag; | |||||
| uint64_t groupName; | |||||
| uint32_t localRank; | |||||
| uint32_t workFlowMode; | |||||
| }; | |||||
| /** | |||||
| * @name MsprofHcclProfData | |||||
| * @brief struct of data reported by hccl | |||||
| */ | |||||
| struct MsprofHcclProfData { | |||||
| uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; | |||||
| uint16_t dataTag = MSPROF_HCCL_DATA_TAG; | |||||
| uint32_t planeID; | |||||
| uint32_t deviceID; | |||||
| uint32_t streamID; | |||||
| double ts; | |||||
| char name[16]; | |||||
| union { | |||||
| MsprofHcclProfNotify notify; | |||||
| MsprofHcclProfReduce reduce; | |||||
| MsprofHcclProfStageStep stageStep; | |||||
| MsprofHcclProfMemcpy forMemcpy; | |||||
| MsprofHcclProfRDMA RDMA; | |||||
| MsprofHcclProfFlag flag; | |||||
| } args; | |||||
| }; | |||||
| #pragma pack() | |||||
| /** | |||||
| * @name MsprofStampInfo | |||||
| * @brief struct of data reported by msproftx | |||||
| */ | |||||
| struct MsprofStampInfo { | |||||
| uint16_t magicNumber; | |||||
| uint16_t dataTag; | |||||
| uint32_t processId; | |||||
| uint32_t threadId; | |||||
| uint32_t category; //marker category | |||||
| uint32_t eventType; | |||||
| int32_t payloadType; | |||||
| union PayloadValue //payload info for marker | |||||
| { | |||||
| uint64_t ullValue; | |||||
| int64_t llValue; | |||||
| double dValue; | |||||
| uint32_t uiValue[2]; | |||||
| int32_t iValue[2]; | |||||
| float fValue[2]; | |||||
| } payload; | |||||
| uint64_t startTime; | |||||
| uint64_t endTime; | |||||
| int32_t messageType; | |||||
| char message[128]; | |||||
| uint8_t reserve0[4]; | |||||
| uint8_t reserve1[72]; | |||||
| }; | |||||
| #ifdef __cplusplus | |||||
| } | |||||
| #endif | |||||
| #endif // MSPROFILER_PROF_COMMON_H_ | |||||