Browse Source

upgrade Ascend package 27 Apr 22

tags/v1.8.0^2
yanghaoran 2 years ago
parent
commit
13887303d0
66 changed files with 2693 additions and 1109 deletions
  1. +14
    -21
      inc/aicpu/aicpu_schedule/aicpu_op_type_list.h
  2. +104
    -0
      inc/aicpu/common/aicpu_task_struct.h
  3. +48
    -0
      inc/aicpu/common/type_def.h
  4. +5
    -5
      inc/aicpu/tsd/status.h
  5. +3
    -1
      inc/external/acl/acl_base.h
  6. +37
    -0
      inc/external/acl/acl_prof.h
  7. +11
    -0
      inc/external/acl/acl_tdt.h
  8. +1
    -0
      inc/external/acl/error_codes/ge_error_codes.h
  9. +1
    -0
      inc/external/acl/error_codes/rt_error_codes.h
  10. +23
    -2
      inc/external/ge/ge_api_types.h
  11. +1
    -0
      inc/external/ge/ge_error_codes.h
  12. +0
    -1
      inc/external/hccl/hccl_types.h
  13. +1
    -0
      inc/external/runtime/rt_error_codes.h
  14. +0
    -80
      inc/framework/common/debug/log.h
  15. +3
    -3
      inc/framework/common/fmk_error_codes.h
  16. +16
    -2
      inc/framework/common/ge_types.h
  17. +19
    -16
      inc/framework/common/helper/model_helper.h
  18. +11
    -15
      inc/framework/common/helper/om_file_helper.h
  19. +34
    -1
      inc/framework/common/profiling_definitions.h
  20. +2
    -3
      inc/framework/common/types.h
  21. +11
    -8
      inc/framework/common/util.h
  22. +0
    -1
      inc/framework/executor/ge_executor.h
  23. +18
    -22
      inc/framework/omg/omg.h
  24. +128
    -0
      inc/framework/pne/pne_model.h
  25. +55
    -0
      inc/framework/pne/process_node_engine.h
  26. +1
    -1
      metadef
  27. +0
    -84
      third_party/fwkacllib/inc/aicpu/common/aicpu_task_struct.h
  28. +0
    -52
      third_party/fwkacllib/inc/common/type_def.h
  29. +14
    -0
      third_party/fwkacllib/inc/hccl/base.h
  30. +36
    -1
      third_party/fwkacllib/inc/ops/array_ops.h
  31. +1
    -1
      third_party/fwkacllib/inc/ops/avg_pool_1d_ops.h
  32. +46
    -0
      third_party/fwkacllib/inc/ops/data_flow_ops.h
  33. +101
    -10
      third_party/fwkacllib/inc/ops/deep_md.h
  34. +175
    -13
      third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
  35. +42
    -14
      third_party/fwkacllib/inc/ops/image_ops.h
  36. +4
    -4
      third_party/fwkacllib/inc/ops/math_ops.h
  37. +144
    -27
      third_party/fwkacllib/inc/ops/matrix_calculation_ops.h
  38. +60
    -58
      third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
  39. +118
    -25
      third_party/fwkacllib/inc/ops/nn_detect_ops.h
  40. +78
    -7
      third_party/fwkacllib/inc/ops/nn_norm_ops.h
  41. +191
    -29
      third_party/fwkacllib/inc/ops/nn_ops.h
  42. +72
    -53
      third_party/fwkacllib/inc/ops/nn_pooling_ops.h
  43. +1
    -0
      third_party/fwkacllib/inc/ops/nn_training_ops.h
  44. +77
    -6
      third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h
  45. +1
    -2
      third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h
  46. +8
    -8
      third_party/fwkacllib/inc/ops/quantize_ops.h
  47. +101
    -93
      third_party/fwkacllib/inc/ops/reduce_ops.h
  48. +545
    -248
      third_party/fwkacllib/inc/ops/rnn.h
  49. +82
    -30
      third_party/fwkacllib/inc/ops/selection_ops.h
  50. +4
    -4
      third_party/fwkacllib/inc/ops/split_combination_ops.h
  51. +24
    -0
      third_party/fwkacllib/inc/ops/stateful_random_ops.h
  52. +1
    -1
      third_party/fwkacllib/inc/ops/target_crop_and_resize.h
  53. +17
    -14
      third_party/fwkacllib/inc/ops/transformation_ops.h
  54. +21
    -0
      third_party/fwkacllib/inc/runtime/base.h
  55. +4
    -2
      third_party/fwkacllib/inc/runtime/config.h
  56. +0
    -53
      third_party/fwkacllib/inc/runtime/dvfsprofile.h
  57. +51
    -59
      third_party/fwkacllib/inc/runtime/kernel.h
  58. +21
    -0
      third_party/fwkacllib/inc/runtime/mem.h
  59. +0
    -1
      third_party/fwkacllib/inc/runtime/rt.h
  60. +1
    -1
      third_party/fwkacllib/inc/runtime/rt_ffts_plus.h
  61. +1
    -1
      third_party/fwkacllib/inc/runtime/rt_model.h
  62. +11
    -0
      third_party/fwkacllib/inc/runtime/stream.h
  63. +2
    -0
      third_party/fwkacllib/inc/toolchain/prof_acl_api.h
  64. +63
    -25
      third_party/fwkacllib/inc/toolchain/prof_callback.h
  65. +27
    -1
      third_party/fwkacllib/inc/toolchain/prof_common.h
  66. +1
    -0
      third_party/fwkacllib/inc/toolchain/slog.h

third_party/fwkacllib/inc/aicpu/aicpu_schedule/aicpu_op_type_list.h → inc/aicpu/aicpu_schedule/aicpu_op_type_list.h View File

@@ -18,38 +18,31 @@
#define AICPU_OP_TYPE_LIST_H_
extern "C" {
enum OpKernelType {
TF_KERNEL,
CPU_KERNEL
};
enum OpKernelType { TF_KERNEL, CPU_KERNEL };
enum ReturnCode {
OP_TYPE_NOT_SUPPORT,
FORMAT_NOT_SUPPORT,
DTYPE_NOT_SUPPORT
};
enum ReturnCode { OP_TYPE_NOT_SUPPORT, FORMAT_NOT_SUPPORT, DTYPE_NOT_SUPPORT };
#pragma pack(push, 1)
// One byte alignment
struct SysOpInfo {
uint64_t opLen;
uint64_t opType;
OpKernelType kernelsType;
uint64_t opLen;
uint64_t opType;
OpKernelType kernelsType;
};
struct SysOpCheckInfo {
uint64_t opListNum;
uint64_t offSetLen;
uint64_t sysOpInfoList;
uint64_t opParamInfoList;
uint64_t opListNum;
uint64_t offSetLen;
uint64_t sysOpInfoList;
uint64_t opParamInfoList;
};
struct SysOpCheckResp {
uint64_t opListNum;
bool isWithoutJson;
uint64_t returnCodeList;
uint64_t sysOpInfoList;
uint64_t opParamInfoList;
uint64_t opListNum;
bool isWithoutJson;
uint64_t returnCodeList;
uint64_t sysOpInfoList;
uint64_t opParamInfoList;
};
#pragma pack(pop)
}

+ 104
- 0
inc/aicpu/common/aicpu_task_struct.h View File

@@ -0,0 +1,104 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef AICPU_TASK_STRUCT_H
#define AICPU_TASK_STRUCT_H

#include <cstdint>

namespace aicpu {

// Fixed-width alias for plain char, used for on-wire string fields below.
using char_t = char;

// All structures in this header are exchanged between the host and the AI CPU
// as raw byte buffers, so they are packed to 1-byte alignment: field offsets
// and total sizes must not change. Do not reorder or resize any member.
#pragma pack(push, 1)
// Header preceding an AI CPU task's parameter area.
struct AicpuParamHead {
uint32_t length; // Total length: includes custom message
uint32_t ioAddrNum; // Input and output address number
uint32_t extInfoLength; // extInfo struct Length
uint64_t extInfoAddr; // extInfo address
};

// Kinds of buffer-configuration messages sent to the AI CPU
// (carried in AicpuConfigMsg::msgType as a uint8_t).
enum class AicpuConfigMsgType {
AICPU_CONFIG_MSG_TYPE_BUF_FREE = 0, /* free buf */
AICPU_CONFIG_MSG_TYPE_BUF_RESET = 1, /* reset buf */
AICPU_CONFIG_MSG_TYPE_BUF_SET_ADDR = 2, /* set buf addr to aicpu */
};

// Origin of an error message record (carried in the errType byte of the
// *ErrMsgInfo structures below).
enum class AicpuErrMsgType {
ERR_MSG_TYPE_NULL = 0,
ERR_MSG_TYPE_AICORE = 1,
ERR_MSG_TYPE_AICPU = 2,
};

// Kinds of extended-info messages (carried in AicpuExtendInfo::msgType).
enum class AicpuExtInfoMsgType {
EXT_MODEL_ID_MSG_TYPE = 0,
};

// Buffer-configuration request; msgType holds an AicpuConfigMsgType value.
struct AicpuConfigMsg {
uint8_t msgType;
uint8_t reserved1;
uint16_t bufLen;
uint32_t offset;
uint64_t bufAddr;
uint32_t tsId;
uint32_t reserved2;
};

// Model-id mapping payload carried inside AicpuExtendInfo.
struct AicpuModelIdInfo {
uint32_t modelId;
uint32_t extendModelId;
uint32_t extendInfo[13];
};

// 64 bytes
struct AicpuExtendInfo {
uint8_t msgType;
uint8_t version;
uint8_t reserved[2];
union {
AicpuModelIdInfo modelIdMap;
};
};

// Error record reported for an AI Core task; fixed 256-byte layout.
struct AicoreErrMsgInfo {
uint8_t errType;
uint8_t version;
uint8_t reserved1[2]; /* reserved1, 4 byte alignment */
uint32_t errorCode;
uint32_t modelId;
uint32_t taskId;
uint32_t streamId;
uint64_t transactionId;
uint8_t reserved2[228]; /* the total byte is 256, reserved2 len = 256 - other lens */
};

// Error record reported for an AI CPU task; fixed 256-byte layout.
struct AicpuErrMsgInfo {
uint8_t errType;
uint8_t version;
uint8_t reserved1[2]; /* reserved1, 4 byte alignment */
uint32_t errorCode;
uint32_t modelId;
uint32_t streamId;
uint64_t transactionId;
char_t opName[64]; /* op name str */
char_t errDesc[128]; /* err msg desc info */
uint8_t reserved2[40]; /* the total byte is 256, reserved2 len = 256 - other lens */
};
#pragma pack(pop)

} // namespace aicpu

#endif // AICPU_TASK_STRUCT_H

+ 48
- 0
inc/aicpu/common/type_def.h View File

@@ -0,0 +1,48 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* Description:interface.
* Create: 2021-12-21
*/
#ifndef AICPU_TYPE_DEF_H
#define AICPU_TYPE_DEF_H

#include <cstdint>
#include <cstddef>
// Fixed-name aliases for fundamental types, guarded so that a platform
// already providing them (e.g. via a toolchain header) is left untouched.
#ifndef char_t
typedef char char_t;
#endif

#ifndef float32_t
typedef float float32_t;
#endif

#ifndef float64_t
typedef double float64_t;
#endif

/// Converts a pointer to its integer representation.
/// Goes through uintptr_t so the pointer -> integer conversion is well defined.
/// (Note: the redundant `const` previously applied to the cast target types was
/// meaningless on a prvalue and has been removed.)
inline uint64_t PtrToValue(const void *ptr) {
  return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ptr));
}

/// Converts an integer value (as produced by PtrToValue) back to a pointer.
inline void *ValueToPtr(const uint64_t value) {
  return reinterpret_cast<void *>(static_cast<uintptr_t>(value));
}

/// Reinterprets a TI* as a TO*. Caller is responsible for the aliasing rules;
/// this is a thin, named wrapper around reinterpret_cast.
template <typename TI, typename TO>
inline TO *PtrToPtr(TI *ptr) {
  return reinterpret_cast<TO *>(ptr);
}

/// Bounds-checked element access: returns &ptr[idx] when ptr is non-null and
/// idx < maxIdx, otherwise nullptr. `ptr + idx` is already T*, so no cast is
/// needed (the previous reinterpret_cast was redundant).
template <typename T>
inline T *PtrAdd(T *const ptr, const size_t maxIdx, const size_t idx) {
  if ((ptr != nullptr) && (idx < maxIdx)) {
    return ptr + idx;
  }
  return nullptr;
}
#endif  // AICPU_TYPE_DEF_H

third_party/fwkacllib/inc/tsd/status.h → inc/aicpu/tsd/status.h View File

@@ -19,11 +19,11 @@
#include "common/type_def.h"
namespace tsd {
#ifdef __cplusplus
using TSD_StatusT = uint32_t;
using TSD_StatusT = uint32_t;
#else
typedef uint32_t TSD_StatusT;
typedef uint32_t TSD_StatusT;
#endif
// success code
constexpr TSD_StatusT TSD_OK = 0U;
}
// success code
constexpr TSD_StatusT TSD_OK = 0U;
} // namespace tsd
#endif // INC_TDT_STATUS_H

+ 3
- 1
inc/external/acl/acl_base.h View File

@@ -114,6 +114,7 @@ static const int ACL_ERROR_INVALID_MAX_OPQUEUE_NUM_CONFIG = 148048;
static const int ACL_ERROR_INVALID_OPP_PATH = 148049;
static const int ACL_ERROR_OP_UNSUPPORTED_DYNAMIC = 148050;
static const int ACL_ERROR_RELATIVE_RESOURCE_NOT_CLEARED = 148051;
static const int ACL_ERROR_UNSUPPORTED_JPEG = 148052;

static const int ACL_ERROR_BAD_ALLOC = 200000;
static const int ACL_ERROR_API_NOT_SUPPORT = 200001;
@@ -153,7 +154,8 @@ typedef enum {
ACL_BOOL = 12,
ACL_STRING = 13,
ACL_COMPLEX64 = 16,
ACL_COMPLEX128 = 17
ACL_COMPLEX128 = 17,
ACL_BF16 = 27
} aclDataType;

typedef enum {


+ 37
- 0
inc/external/acl/acl_prof.h View File

@@ -60,6 +60,15 @@ typedef enum {
ACL_STEP_END = 1 // step end
} aclprofStepTag;

typedef enum {
ACL_SUBSCRIBE_OP = 0,
ACL_SUBSCRIBE_SUBGRAPH = 1,
ACL_SUBSCRIBE_OP_THREAD = 2,
ACL_SUBSCRIBE_NONE
} aclprofSubscribeOpFlag;

typedef enum { ACL_SUBSCRIBE_ATTRI_THREADID = 0, ACL_SUBSCRIBE_ATTRI_NONE } aclprofSubscribeOpAttri;

typedef struct aclprofConfig aclprofConfig;
typedef struct aclprofStopConfig aclprofStopConfig;
typedef struct aclprofAicoreEvents aclprofAicoreEvents;
@@ -338,6 +347,34 @@ MSVP_PROF_API uint64_t aclprofGetOpDuration(const void *opInfo, size_t opInfoLen
*/
MSVP_PROF_API size_t aclprofGetModelId(const void *opInfo, size_t opInfoLen, uint32_t index);

/**
* @ingroup AscendCL
* @brief get op flag from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param index [IN] index of op array in opInfo
*
* @retval op flag
* @retval ACL_SUBSCRIBE_NONE for failed
*/
MSVP_PROF_API aclprofSubscribeOpFlag aclprofGetOpFlag(const void *opInfo, size_t opInfoLen, uint32_t index);

/**
* @ingroup AscendCL
* @brief get op flag from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param index [IN] index of op array in opInfo
* @param attri [IN] attribute of op
*
* @retval op flag
* @retval NULL for failed
*/
MSVP_PROF_API const char *aclprofGetOpAttriValue(const void *opInfo, size_t opInfoLen, uint32_t index,
aclprofSubscribeOpAttri attri);

/**
* @ingroup AscendCL
* @brief


+ 11
- 0
inc/external/acl/acl_tdt.h View File

@@ -197,6 +197,17 @@ ACL_FUNC_VISIBILITY aclError acltdtAddDataItem(acltdtDataset *dataset, acltdtDat
*/
ACL_FUNC_VISIBILITY size_t acltdtGetDatasetSize(const acltdtDataset *dataset);

/**
* @ingroup AscendCL
* @brief Get the name of dataset
*
* @param dataset [IN] pointer to the dataset
*
* @retval null for failed
* @retval OtherValues success
*/
ACL_FUNC_VISIBILITY const char *acltdtGetDatasetName(const acltdtDataset *dataset);

/**
* @ingroup AscendCL
* @brief Stop the channel


+ 1
- 0
inc/external/acl/error_codes/ge_error_codes.h View File

@@ -59,6 +59,7 @@ static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021U;
static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022U;
static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000U;
static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001U;
static const uint32_t ACL_ERROR_GE_DEVICE_MEMORY_OPERATE_FAILED = 245002U;
static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000U;
static const uint32_t ACL_ERROR_GE_LOAD_MODEL = 545001U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_PARTITION_FAILED = 545002U;


+ 1
- 0
inc/external/acl/error_codes/rt_error_codes.h View File

@@ -45,6 +45,7 @@ static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid
static const int32_t ACL_ERROR_RT_INVALID_HANDLE = 107017; // invalid handle
static const int32_t ACL_ERROR_RT_INVALID_MALLOC_TYPE = 107018; // invalid malloc type
static const int32_t ACL_ERROR_RT_WAIT_TIMEOUT = 107019; // wait timeout
static const int32_t ACL_ERROR_RT_TASK_TIMEOUT = 107020; // task timeout

static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPORT = 207000; // feature not support
static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error


+ 23
- 2
inc/external/ge/ge_api_types.h View File

@@ -120,6 +120,8 @@ const char_t *const PERFORMANCE_MODE = "ge.performance_mode";
const char_t *const SHAPE_GENERALIZED_BUILD_MODE = "ge.shape_generalized_build_mode";
const char_t *const MODIFY_MIXLIST = "ge.exec.modify_mixlist";
const char_t *const OP_PRECISION_MODE = "ge.exec.op_precision_mode";
const char_t *const CUSTOMIZE_DTYPES = "ge.customizeDtypes";
const char_t *const COMPRESSION_OPTIMIZE_CONF = "ge.compressionOptimizeConf";
} // namespace configure_option
// Configure stream num by Session constructor options param,
// its value should be int32_t type, default value is "1"
@@ -268,9 +270,18 @@ const std::string ENABLE_SMALL_CHANNEL = "ge.enableSmallChannel";
// Configure Compress Weight flag
const std::string ENABLE_COMPRESS_WEIGHT = "ge.enableCompressWeight";

// Configure Sparse Matrix Weight flag
const std::string ENABLE_SPARSE_MATRIX_WEIGHT = "ge.enableSparseMatrixWeight";

// Configure fusion switch file path
const std::string FUSION_SWITCH_FILE = "ge.fusionSwitchFile";

// Configure compression optimize file path
const std::string COMPRESSION_OPTIMIZE_CONF = "ge.compressionOptimizeConf";

// Configure customize dtypes path
const std::string CUSTOMIZE_DTYPES = "ge.customizeDtypes";

// Save original model
const std::string SAVE_ORIGINAL_MODEL = "ge.saveOriginalModel";

@@ -289,6 +300,10 @@ const char_t *const ENABLE_PRINT_OP_PASS = "ge.enablePrintOpPass";
// Its value should be file path, default value is "./"
const char_t *const DEBUG_DIR = "ge.debugDir";

// Configure switch for op status check such as overflow
// Its value should be true or false
const char_t *const STATUS_CHECK = "ge.status_check";

// Configure operator compiler cache path
// Its value should be file path, default value is "./"
const char_t *const OP_COMPILER_CACHE_DIR = "ge.op_compiler_cache_dir";
@@ -411,6 +426,7 @@ static const char_t *const OP_SELECT_IMPL_MODE = ge::OP_SELECT_IMPL_MODE.c_str()
static const char_t *const OUTPUT_TYPE = ge::OUTPUT_DATATYPE.c_str();
static const char_t *const BUFFER_OPTIMIZE = ge::BUFFER_OPTIMIZE.c_str();
static const char_t *const ENABLE_COMPRESS_WEIGHT = ge::ENABLE_COMPRESS_WEIGHT.c_str();
static const char_t *const SPARSITY = ge::ENABLE_SPARSE_MATRIX_WEIGHT.c_str();
static const char_t *const COMPRESS_WEIGHT_CONF = "compress_weight_conf";
static const char_t *const OUT_NODES = ge::OUTPUT_NODE_NAME.c_str();
static const char_t *const INPUT_FP16_NODES = ge::INPUT_FP16_NODES.c_str();
@@ -427,6 +443,8 @@ static const char_t *const PERFORMANCE_MODE = ge::PERFORMANCE_MODE.c_str();
static const char_t *const SHAPE_GENERALIZED_BUILD_MODE = ge::SHAPE_GENERALIZED_BUILD_MODE.c_str();
static const char_t *const MODIFY_MIXLIST = ge::MODIFY_MIXLIST.c_str();
static const char_t *const OP_PRECISION_MODE = ge::OP_PRECISION_MODE.c_str();
static const char_t *const CUSTOMIZE_DTYPES = "ge.customizeDtypes";
static const char_t *const COMPRESSION_OPTIMIZE_CONF = "ge.compressionOptimizeConf";

// for interface: aclgrphBuildModel
#ifdef __GNUC__
@@ -456,7 +474,8 @@ const std::set<std::string> ir_builder_suppported_options = {INPUT_FORMAT,
OP_BANK_UPDATE,
PERFORMANCE_MODE,
SHAPE_GENERALIZED_BUILD_MODE,
MODIFY_MIXLIST};
MODIFY_MIXLIST,
CUSTOMIZE_DTYPES};

// for interface: aclgrphParse
const std::set<std::string> ir_parser_suppported_options = {
@@ -469,6 +488,7 @@ const std::set<std::string> global_options = {CORE_TYPE,
BUFFER_OPTIMIZE,
ENABLE_COMPRESS_WEIGHT,
COMPRESS_WEIGHT_CONF,
SPARSITY,
PRECISION_MODE,
TUNE_DEVICE_IDS,
EXEC_DISABLE_REUSED_MEMORY,
@@ -483,7 +503,8 @@ const std::set<std::string> global_options = {CORE_TYPE,
DEBUG_DIR,
OP_COMPILER_CACHE_DIR,
OP_COMPILER_CACHE_MODE,
MODIFY_MIXLIST};
MODIFY_MIXLIST,
COMPRESSION_OPTIMIZE_CONF};
#endif
} // namespace ir_option
} // namespace ge


+ 1
- 0
inc/external/ge/ge_error_codes.h View File

@@ -59,6 +59,7 @@ static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021U;
static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022U;
static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000U;
static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001U;
static const uint32_t ACL_ERROR_GE_DEVICE_MEMORY_OPERATE_FAILED = 245002U;
static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000U;
static const uint32_t ACL_ERROR_GE_LOAD_MODEL = 545001U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_PARTITION_FAILED = 545002U;


+ 0
- 1
inc/external/hccl/hccl_types.h View File

@@ -88,7 +88,6 @@ typedef enum {
} HcclDataType;

const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length

/**
* @brief HCCL root info
*/


+ 1
- 0
inc/external/runtime/rt_error_codes.h View File

@@ -45,6 +45,7 @@ static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid
static const int32_t ACL_ERROR_RT_INVALID_HANDLE = 107017; // invalid handle
static const int32_t ACL_ERROR_RT_INVALID_MALLOC_TYPE = 107018; // invalid malloc type
static const int32_t ACL_ERROR_RT_WAIT_TIMEOUT = 107019; // wait timeout
static const int32_t ACL_ERROR_RT_TASK_TIMEOUT = 107020; // task timeout

static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPORT = 207000; // feature not support
static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error


+ 0
- 80
inc/framework/common/debug/log.h View File

@@ -129,86 +129,6 @@
} \
}

// If expr is not true, print the log and execute a custom statement
#define GE_CHK_BOOL_EXEC_WARN(expr, exec_expr, ...) \
{ \
const bool b = (expr); \
if (!b) { \
GELOGW(__VA_ARGS__); \
exec_expr; \
} \
}
// If expr is not true, print the log and execute a custom statement
#define GE_CHK_BOOL_EXEC_INFO(expr, exec_expr, ...) \
{ \
const bool b = (expr); \
if (!b) { \
GELOGI(__VA_ARGS__); \
exec_expr; \
} \
}

// If expr is not true, print the log and execute a custom statement
#define GE_CHK_BOOL_TRUE_EXEC_INFO(expr, exec_expr, ...) \
{ \
const bool b = (expr); \
if (b) { \
GELOGI(__VA_ARGS__); \
exec_expr; \
} \
}

// If expr is true, print logs and execute custom statements
#define GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(expr, exec_expr, ...) \
{ \
const bool b = (expr); \
if (b) { \
GELOGE(ge::FAILED, __VA_ARGS__); \
exec_expr; \
} \
}
// If expr is true, print the Information log and execute a custom statement
#define GE_CHK_TRUE_EXEC_INFO(expr, exec_expr, ...) \
{ \
const bool b = (expr); \
if (b) { \
GELOGI(__VA_ARGS__); \
exec_expr; \
} \
}

// If expr is not SUCCESS, print the log and execute the expression + return
#define GE_CHK_BOOL_TRUE_RET_VOID(expr, exec_expr, ...) \
{ \
const bool b = (expr); \
if (b) { \
GELOGE(ge::FAILED, __VA_ARGS__); \
exec_expr; \
return; \
} \
}

// If expr is not SUCCESS, print the log and execute the expression + return _status
#define GE_CHK_BOOL_TRUE_EXEC_RET_STATUS(expr, _status, exec_expr, ...) \
{ \
const bool b = (expr); \
if (b) { \
REPORT_INNER_ERROR("E19999", __VA_ARGS__); \
GELOGE(ge::FAILED, __VA_ARGS__); \
exec_expr; \
return (_status); \
} \
}

// If expr is not true, execute a custom statement
#define GE_CHK_BOOL_EXEC_NOLOG(expr, exec_expr) \
{ \
const bool b = (expr); \
if (!b) { \
exec_expr; \
} \
}

// -----------------runtime related macro definitions-------------------------------
// If expr is not RT_ERROR_NONE, print the log
#define GE_CHK_RT(expr) \


+ 3
- 3
inc/framework/common/fmk_error_codes.h View File

@@ -42,9 +42,9 @@
#include "register/register_error_codes.h"

// Each module uses the following four macros to define error codes:
#define DECLARE_ERRORNO_OMG(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_OMG, (name), (value))
#define DECLARE_ERRORNO_OME(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_OME, (name), (value))
#define DECLARE_ERRORNO_CALIBRATION(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_CALIBRATION, (name), (value))
#define DECLARE_ERRORNO_OMG(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_OMG, name, value)
#define DECLARE_ERRORNO_OME(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_OME, name, value)
#define DECLARE_ERRORNO_CALIBRATION(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_CALIBRATION, name, value)

#define DEF_ERRORNO(name, desc) const ErrorNoRegisterar g_##name##_errorno((name), (desc));



+ 16
- 2
inc/framework/common/ge_types.h View File

@@ -88,11 +88,12 @@ constexpr uint64_t kInferSessionId = 0U;
constexpr uint64_t kReleaseFlag = 1U;
constexpr uint32_t kInvalidModelId = 0xFFFFFFFFU;
constexpr size_t kNumTaskWithAtomicAddrCleanTask = 2U;
constexpr uint32_t INVALID_MODEL_ID = 0xFFFFFFFFUL;

// dynamic execute mode
const char_t *const kLazyRecompile = "lazy_recompile";

constexpr size_t kMaxHostMemInputLen = 64U;
constexpr size_t kMaxHostMemInputLen = 128U; // 64 aligned

// Data cache, including data address and length
struct DataBuffer {
@@ -239,6 +240,19 @@ struct ModelData {
std::string om_name; // om file name, used for data dump
};

struct ModelParam {
ModelParam() : priority(0), mem_base(0U), mem_size(0U), weight_base(0U), weight_size(0U) {}
ModelParam(const int32_t pri, const uintptr_t m_base, const size_t m_len, const uintptr_t w_base, const size_t w_len)
: priority(pri), mem_base(m_base), mem_size(m_len), weight_base(w_base), weight_size(w_len) {}
~ModelParam() = default;

int32_t priority;
uintptr_t mem_base;
size_t mem_size;
uintptr_t weight_base;
size_t weight_size;
};

// The definition of Model information
struct ModelInfo {
uint32_t version = 0U;
@@ -314,7 +328,7 @@ struct TaskDescInfo {
std::vector<Format> output_format;
std::vector<std::vector<int64_t>> output_shape;
std::vector<DataType> output_data_type;
uint32_t context_id;
uint32_t context_id = 0xFFFFFFFFUL;
};

struct OpDescInfo {


+ 19
- 16
inc/framework/common/helper/model_helper.h View File

@@ -35,11 +35,11 @@ class GE_FUNC_VISIBILITY ModelHelper {
Status SaveToOmModel(const GeModelPtr &ge_model, const SaveParam &save_param, const std::string &output_file,
ge::ModelBufferData &model) const;
Status SaveToOmRootModel(const GeRootModelPtr &ge_root_model, const SaveParam &save_param,
const std::string &output_file, ModelBufferData &model, const bool is_unknown_shape);
Status SaveOriginalGraphToOmModel(const ge::Graph &graph, const std::string &output_file);
const std::string &output_file, ModelBufferData &model, const bool is_unknown_shape) const;
Status SaveOriginalGraphToOmModel(const ge::Graph &graph, const std::string &output_file) const;
Status LoadModel(const ge::ModelData &model_data);
Status LoadRootModel(const ge::ModelData &model_data);
static void SetModelToGeModel(GeModelPtr &ge_model, Model &model);
static void SetModelToGeModel(const GeModelPtr &ge_model, Model &model);

GeModelPtr GetGeModel();
GeRootModelPtr GetGeRootModel();
@@ -52,7 +52,7 @@ class GE_FUNC_VISIBILITY ModelHelper {
}

Status GetBaseNameFromFileName(const std::string &file_name, std::string &base_name) const;
Status GetModelNameFromMergedGraphName(const std::string &graph_name, std::string &model_name) const;
Status GetModelNameFromMergedGraphName(const ComputeGraphPtr &compute_graph, std::string &model_name) const;

private:
bool is_assign_model_ = false;
@@ -64,18 +64,21 @@ class GE_FUNC_VISIBILITY ModelHelper {

ModelHelper(const ModelHelper &) = default;
ModelHelper &operator=(const ModelHelper &) = default;
Status GenerateGeModel(OmFileLoadHelper &om_load_helper);
Status GenerateGeRootModel(OmFileLoadHelper &om_load_helper);
Status LoadModelData(OmFileLoadHelper &om_load_helper);
Status LoadModelData(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, const size_t mode_index) const;
Status LoadWeights(OmFileLoadHelper &om_load_helper);
Status LoadWeights(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, const size_t mode_index) const;
Status LoadTask(OmFileLoadHelper &om_load_helper);
Status LoadTask(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, const size_t mode_index) const;
Status LoadTBEKernelStore(OmFileLoadHelper &om_load_helper);
Status LoadTBEKernelStore(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, const size_t mode_index) const;
Status LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper);
Status LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model,

bool IsPartitionedGraph(const GeModelPtr &cur_model) const;

Status GenerateGeModel(const OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, const size_t mode_index,
const bool is_dyn_root);
Status GenerateGeRootModel(const OmFileLoadHelper &om_load_helper);

Status LoadModelData(const OmFileLoadHelper &om_load_helper, const GeModelPtr &cur_model,
const size_t mode_index) const;
Status LoadWeights(const OmFileLoadHelper &om_load_helper, const GeModelPtr &cur_model,
const size_t mode_index) const;
Status LoadTask(const OmFileLoadHelper &om_load_helper, const GeModelPtr &cur_model, const size_t mode_index) const;
Status LoadTBEKernelStore(const OmFileLoadHelper &om_load_helper, const GeModelPtr &cur_model,
const size_t mode_index) const;
Status LoadCustAICPUKernelStore(const OmFileLoadHelper &om_load_helper, const GeModelPtr &cur_model,
const size_t mode_index) const;

Status SaveModelPartition(std::shared_ptr<OmFileSaveHelper> &om_file_save_helper, const ModelPartitionType type,


+ 11
- 15
inc/framework/common/helper/om_file_helper.h View File

@@ -48,7 +48,7 @@ struct SaveParam {

class GE_FUNC_VISIBILITY OmFileLoadHelper {
public:
Status Init(const ge::ModelData &model);
Status Init(const ModelData &model);

Status Init(uint8_t *const model_data, const uint32_t model_data_size);

@@ -56,16 +56,15 @@ class GE_FUNC_VISIBILITY OmFileLoadHelper {

Status GetModelPartition(const ModelPartitionType type, ModelPartition &partition);

Status GetModelPartition(const ModelPartitionType type, ModelPartition &partition, const size_t model_index);
Status GetModelPartition(const ModelPartitionType type, ModelPartition &partition, const size_t model_index) const;

OmFileContext context_;

std::vector<OmFileContext> model_contexts_;

private:
Status CheckModelValid(const ge::ModelData &model) const;

Status LoadModelPartitionTable(uint8_t *const model_data, const uint32_t model_data_size);
Status LoadModelPartitionTable(uint8_t *const model_data, const uint32_t model_data_size, const size_t model_index,
size_t &mem_offset);

Status LoadModelPartitionTable(uint8_t *const model_data, const uint32_t model_data_size, const uint32_t model_num);

@@ -78,9 +77,7 @@ class GE_FUNC_VISIBILITY OmFileSaveHelper {
return model_header_;
}

uint32_t GetModelDataSize() const {
return context_.model_data_len_;
}
uint32_t GetModelDataSize() const;

ModelPartitionTable *GetPartitionTable();

@@ -88,20 +85,19 @@ class GE_FUNC_VISIBILITY OmFileSaveHelper {

Status AddPartition(const ModelPartition &partition, const size_t cur_index);

Status SaveModel(const SaveParam &save_param, const char_t *const output_file, ge::ModelBufferData &model,
Status SaveModel(const SaveParam &save_param, const char_t *const output_file, ModelBufferData &model,
const bool is_offline = true);

Status SaveModelToFile(const char_t *const output_file, ge::ModelBufferData &model, const bool is_offline = true);

std::vector<OmFileContext> model_contexts_;

ModelFileHeader model_header_;
OmFileContext context_;
Status SaveModelToFile(const char_t *const output_file, ModelBufferData &model, const bool is_offline = true);

ModelPartitionTable *GetPartitionTable(const size_t cur_ctx_index);

Status SaveRootModel(const SaveParam &save_param, const char_t *const output_file, ModelBufferData &model,
const bool is_offline);

private:
ModelFileHeader model_header_;
std::vector<OmFileContext> model_contexts_;
};
} // namespace ge
#endif // INC_FRAMEWORK_COMMON_HELPER_OM_FILE_HELPER_H_

+ 34
- 1
inc/framework/common/profiling_definitions.h View File

@@ -49,7 +49,12 @@ enum {
kAtomic,
kKernelLaunchPrepare,
kRtKernelLaunch,
kRtEventCreateRecord,
kRtEventSync,
kRtEventDestroy,
kRtStreamSync,
kOpExecute,
kModelExecute,
kAllocMem,
kCopyH2D,
kPrepareNode,
@@ -67,6 +72,33 @@ enum {
kSelectBranch,
kExecuteSubGraph,
kInitSubGraphExecutor,
// fuzz compile
kSelectBin,
kFindCompileCache,
kAddCompileCache,
kFuzzCompileOp,
kCalcRuningParam,
kGenTask,
kRegisterBin,

// FFTS Plus
kFftsPlusPreThread,
kFftsPlusNodeThread,
kFftsPlusInferShape,
kOpFftsCalculateV2,
kInitThreadRunInfo,
kFftsPlusGraphSchedule,
kKnownGetAddrAndPrefCnt,
kKernelGetAddrAndPrefCnt,
kUpdateAddrAndPrefCnt,
kInitOpRunInfo,
kGetAutoThreadParam,
kAllocateOutputs,
kAllocateWorkspaces,
kInitTaskAddrs,
kInitThreadRunParam,
kUpdateTaskAndCache,
kFftsPlusTaskLaunch,

// Add new definitions here
kProfilingIndexEnd
@@ -88,7 +120,7 @@ class ProfilingContext {
* 因此编译时注册字符串的动作并没有生效。在执行时,动态的打开了profiling,这种场景下,执行时无法拿到注册后字符串
*/
bool IsEnabled() const noexcept {
return enabled_ && profiler_ != nullptr;
return enabled_ && (profiler_ != nullptr);
}
void SetEnable() noexcept {
enabled_ = true;
@@ -184,5 +216,6 @@ class ScopeProfiler {
ge::profiling::ProfilingContext::GetInstance().RecordCurrentThread((element), (event), \
ge::profiling::EventType::kEventEnd)
#define PROFILING_SCOPE(element, event) ge::profiling::ScopeProfiler profiler((element), (event))
#define PROFILING_SCOPE_CONST(element, event) const ge::profiling::ScopeProfiler profiler((element), (event))
#define PROFILING_SCOPE_ELEMENT(element) profiler.SetElement((element))
#endif // AIR_CXX_PROFILING_DEFINITIONS_H

+ 2
- 3
inc/framework/common/types.h View File

@@ -17,13 +17,10 @@
#ifndef INC_FRAMEWORK_COMMON_TYPES_H_
#define INC_FRAMEWORK_COMMON_TYPES_H_

#include <climits>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "framework/common/fmk_error_codes.h"
#include "framework/common/fmk_types.h"
@@ -47,6 +44,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string PROFIL

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_TASKS;
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_TASK_GEN_BASE_ADDR;
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_TASK_GEN_HOST_BASE_ADDR;
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_HOST_MEMORY_SIZE;
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_TASK_GEN_WEIGHT_ADDR;
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const std::string MODEL_ATTR_FUSION_MODEL_DEF;
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY extern const uint64_t ALLOC_MEMORY_MAX_SIZE; // Max size of 8 GB.


+ 11
- 8
inc/framework/common/util.h View File

@@ -231,17 +231,20 @@ constexpr int32_t OM_PROTO_VERSION = 2;
/// @return string
///
template <typename T>
GE_FUNC_VISIBILITY std::string ToString(std::vector<T> &v) {
GE_FUNC_VISIBILITY std::string ToString(const std::vector<T> &v) {
bool first = true;
std::stringstream ss;
ss << "[";
for (const T x : v) {
ss << x;
ss << ", ";
for (const T &x : v) {
if (first) {
first = false;
ss << x;
} else {
ss << ", " << x;
}
}
// Delete the two extra characters at the end of the line.
std::string str = ss.str().substr(0U, ss.str().length() - 2U);
str += "]";
return str;
ss << "]";
return ss.str();
}

///


+ 0
- 1
inc/framework/executor/ge_executor.h View File

@@ -27,7 +27,6 @@
#include "framework/common/types.h"
#include "graph/tensor.h"
#include "graph/ge_tensor.h"
#include "runtime/base.h"

namespace ge {
class SingleOp;


+ 18
- 22
inc/framework/omg/omg.h View File

@@ -17,10 +17,12 @@
#ifndef INC_FRAMEWORK_OMG_OMG_H_
#define INC_FRAMEWORK_OMG_OMG_H_

#include <google/protobuf/message.h>
#include <string>
#include <unordered_map>
#include <vector>

#include <google/protobuf/message.h>
#include "external/ge/ge_api_types.h"
#include "framework/omg/omg_inner_types.h"
#include "framework/omg/parser/parser_inner_ctx.h"
#include "proto/ge_ir.pb.h"
@@ -31,20 +33,14 @@
#include "graph/model.h"
#include "runtime/kernel.h"

using domi::Status;
using std::pair;
using std::string;
using std::unordered_map;
using std::vector;

namespace ge {
/**
* @ingroup domi_omg
* @brief init omg context
* @return void
*/
GE_FUNC_VISIBILITY Status InitDomiOmgContext(const std::string &input_shape, const std::string &input_format,
const std::string &net_format, bool is_dynamic_input);
GE_FUNC_VISIBILITY domi::Status InitDomiOmgContext(const std::string &input_shape, const std::string &input_format,
const std::string &net_format, bool is_dynamic_input);

/**
* @ingroup domi_omg
@@ -61,10 +57,10 @@ GE_FUNC_VISIBILITY Status InitDomiOmgContext(const std::string &input_shape, con
* @param [in] atc_params multiply atc params
* @return Status result code
*/
GE_FUNC_VISIBILITY Status ParseGraph(ge::Graph &graph, const std::map<std::string, std::string> &atc_params,
const char *model_file, const char *weights_file, domi::FrameworkType type,
const char *op_conf = nullptr, const char *target = nullptr,
RunMode run_mode = RunMode::GEN_OM_MODEL, bool is_dynamic_input = false);
GE_FUNC_VISIBILITY domi::Status ParseGraph(ge::Graph &graph, const std::map<std::string, std::string> &atc_params,
const char *model_file, const char *weights_file, domi::FrameworkType type,
const char *op_conf = nullptr, const char *target = nullptr,
RunMode run_mode = RunMode::GEN_OM_MODEL, bool is_dynamic_input = false);

/**
* @ingroup domi_omg
@@ -74,9 +70,9 @@ GE_FUNC_VISIBILITY Status ParseGraph(ge::Graph &graph, const std::map<std::strin
* @param [key] encrypted key
* @return Status result code
*/
GE_FUNC_VISIBILITY Status ConvertOm(const char *model_file, const char *json_file, bool is_covert_to_json);
GE_FUNC_VISIBILITY domi::Status ConvertOm(const char *model_file, const char *json_file, bool is_covert_to_json);

GE_FUNC_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const char *json_file);
GE_FUNC_VISIBILITY domi::Status ConvertPbtxtToJson(const char *model_file, const char *json_file);
/**
* @ingroup domi_omg
* @brief convert the model file in protobuf format into a JSON file.
@@ -86,21 +82,21 @@ GE_FUNC_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const char
* @param [key] encrypted key
* @return Status result code
*/
GE_FUNC_VISIBILITY Status ConvertFwkModelToJson(domi::FrameworkType framework, const char *model_file,
const char *json_file);
GE_FUNC_VISIBILITY domi::Status ConvertFwkModelToJson(const domi::FrameworkType framework, const char *model_file,
const char *json_file);

GE_FUNC_VISIBILITY void GetGroupName(ge::proto::ModelDef &model_def);

GE_FUNC_VISIBILITY void FindParserSo(const std::string &path, std::vector<std::string> &file_list,
std::string &caffe_parser_path);

GE_FUNC_VISIBILITY Status DumpInfershapeJson(const ge::Graph &graph, const char *json_file);
GE_FUNC_VISIBILITY domi::Status DumpInfershapeJson(const ge::Graph &graph, const char *json_file);

GE_FUNC_VISIBILITY Status SetOutputNodeInfo(ge::Graph &graph, const std::string &output_type,
const std::string &output);
GE_FUNC_VISIBILITY domi::Status SetOutputNodeInfo(ge::Graph &graph, const std::string &output_type,
const std::string &output);

GE_FUNC_VISIBILITY Status GetOutputLeaf(ge::NodePtr node,
std::vector<std::pair<ge::NodePtr, int32_t>> &output_nodes_info);
GE_FUNC_VISIBILITY domi::Status GetOutputLeaf(ge::NodePtr node,
std::vector<std::pair<ge::NodePtr, int32_t>> &output_nodes_info);

GE_FUNC_VISIBILITY void CreateOutputNodesInfo(std::vector<std::pair<ge::NodePtr, int32_t>> &output_nodes_info,
std::vector<std::string> &output_nodes_name);


+ 128
- 0
inc/framework/pne/pne_model.h View File

@@ -0,0 +1,128 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_FRAMEWORK_PNE_MODEL_H_
#define INC_FRAMEWORK_PNE_MODEL_H_

#include <map>
#include <string>
#include <vector>

#include "graph/compute_graph.h"
#include "framework/common/debug/log.h"
#include "framework/common/ge_inner_error_codes.h"
#include "framework/common/ge_types.h"
#include "framework/engine/dnnengine.h"
#include "external/ge/ge_ir_build.h"

namespace ge {
const std::string PNE_ID_NPU = "NPU";
const std::string PNE_ID_CPU = "HOST_CPU";

struct ModelRelation;
// Process-node-engine model: the build artifact produced by a ProcessNodeEngine.
// A PneModel may act as a composite: it can hold named sub-models (e.g. per-engine
// partitions of the same root graph) alongside the root ComputeGraph itself.
// Copying is disabled; instances are shared through PneModelPtr (std::shared_ptr).
class PneModel {
 public:
  PneModel() = default;
  explicit PneModel(const ComputeGraphPtr &root_graph) : root_graph_(root_graph){};
  virtual ~PneModel() = default;
  PneModel(const PneModel &other) = delete;
  PneModel &operator=(const PneModel &other) = delete;

 public:
  // Registers a sub-model under its own model name, tagging it with `type`
  // (e.g. PNE_ID_NPU / PNE_ID_CPU). Returns INTERNAL_ERROR when `submodel` is
  // null or when a sub-model with the same name was already added.
  inline Status AddSubModel(const shared_ptr<PneModel> &submodel, std::string type = "") {
    if (submodel == nullptr) {
      GELOGE(INTERNAL_ERROR, "submodel is nullptr, type = %s", type.c_str());
      return INTERNAL_ERROR;
    }
    submodel->SetModelType(type);
    // emplace() refuses duplicates: failure here means the name is already taken.
    if (!submodels_.emplace(submodel->GetModelName(), submodel).second) {
      GELOGE(INTERNAL_ERROR, "submodel already exist, name = %s, type = %s", submodel->GetModelName().c_str(),
             type.c_str());
      return INTERNAL_ERROR;
    }
    return SUCCESS;
  }

  // Looks up a sub-model by name; returns nullptr when absent.
  inline const std::shared_ptr<PneModel> GetSubmodel(const std::string &name) const {
    const auto &it = submodels_.find(name);
    if (it == submodels_.end()) {
      return nullptr;
    }
    return it->second;
  }

  // All registered sub-models, keyed by model name.
  inline const std::map<std::string, std::shared_ptr<PneModel>> &GetSubmodels() const {
    return submodels_;
  }

  // Engine-type tag of this model (see PNE_ID_NPU / PNE_ID_CPU).
  inline void SetModelType(const std::string &type) {
    model_type_ = type;
  }

  inline const std::string &GetModelType() const {
    return model_type_;
  }

  inline void SetModelName(const std::string &model_name) {
    model_name_ = model_name;
  }

  inline const std::string &GetModelName() const {
    return model_name_;
  }

  // Root computation graph this model was built from.
  inline void SetRootGraph(const ComputeGraphPtr graph) {
    root_graph_ = graph;
  }

  inline const ComputeGraphPtr &GetRootGraph() const {
    return root_graph_;
  }

  // Relation information between sub-models; ownership is taken via move.
  inline void SetModelRelation(std::shared_ptr<ModelRelation> model_relation) {
    model_relation_ = std::move(model_relation);
  }

  inline const std::shared_ptr<ModelRelation> GetModelRelation() const {
    return model_relation_;
  }

 public:
  // Serializes this model into `model_buff`; the format is engine-specific.
  virtual Status SerializeModel(ModelBufferData &model_buff) = 0;

  // Restores this model from a buffer previously produced by SerializeModel().
  virtual Status UnSerializeModel(const ModelBufferData &model_buff) = 0;

  virtual void SetModelId(const uint32_t model_id) {
    model_id_ = model_id;
  }

  virtual uint32_t GetModelId() const {
    return model_id_;
  }

 private:
  std::map<std::string, std::shared_ptr<PneModel>> submodels_;  // model name -> sub-model
  std::shared_ptr<ModelRelation> model_relation_;               // may be null
  ComputeGraphPtr root_graph_ = nullptr;
  std::string model_name_;
  std::string model_type_;
  uint32_t model_id_ = INVALID_MODEL_ID;  // unset until SetModelId()
};

using PneModelPtr = std::shared_ptr<PneModel>;
} // namespace ge

#endif // INC_FRAMEWORK_PNE_MODEL_H_

+ 55
- 0
inc/framework/pne/process_node_engine.h View File

@@ -0,0 +1,55 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_FRAMEWORK_PROCESS_NODE_ENGINE_H_
#define INC_FRAMEWORK_PROCESS_NODE_ENGINE_H_

#include <map>
#include <string>
#include <vector>

#include "framework/common/ge_inner_error_codes.h"
#include "framework/common/ge_types.h"
#include "graph/manager/graph_manager_utils.h"
#include "framework/pne/pne_model.h"

namespace ge {
// Abstract interface of a process-node engine (e.g. NPU or host-CPU engine):
// the component that optimizes a partitioned ComputeGraph and builds it into a
// PneModel. Copying is disabled; instances are shared via ProcessNodeEnginePtr.
class ProcessNodeEngine {
 public:
  ProcessNodeEngine() = default;
  virtual ~ProcessNodeEngine() = default;
  ProcessNodeEngine(const ProcessNodeEngine &other) = delete;
  ProcessNodeEngine &operator=(const ProcessNodeEngine &other) = delete;

 public:
  // One-time setup from string key/value options; must precede the other calls.
  virtual Status Initialize(const std::map<std::string, std::string> &options) = 0;

  // Releases engine resources; counterpart of Initialize().
  virtual Status Finalize() = 0;

  // Optimizes `compute_graph` in place for the given graph inputs.
  virtual Status OptimizeGraph(const std::vector<GeTensor> &inputs, ComputeGraphPtr &compute_graph) = 0;

  // Compiles `compute_graph` and returns the built model through `model`.
  virtual Status BuildGraph(ComputeGraphPtr &compute_graph, PneModelPtr &model) = 0;

  // Engine name; `node_ptr` may select a node-specific answer (nullptr = engine default).
  virtual const std::string &GetEngineName(const ge::NodePtr &node_ptr = nullptr) const = 0;

 protected:
  std::string engine_id_;  // identifier of this engine instance
};

using ProcessNodeEnginePtr = std::shared_ptr<ProcessNodeEngine>;
} // namespace ge

#endif // INC_FRAMEWORK_PROCESS_NODE_ENGINE_H_

+ 1
- 1
metadef

@@ -1 +1 @@
Subproject commit ab3207e99f94aabf036e1c8b068de0df15ff2d01
Subproject commit f3e9df35da67ff00a22a09ec5b369bbc4bac9e74

+ 0
- 84
third_party/fwkacllib/inc/aicpu/common/aicpu_task_struct.h View File

@@ -1,84 +0,0 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef AICPU_TASK_STRUCT_H_
#define AICPU_TASK_STRUCT_H_

#include <cstdint>

namespace aicpu {

// All structures below are exchanged with the AICPU device side, so they are
// packed to 1-byte alignment: the layout must match the device firmware exactly.
#pragma pack(push, 1)

// Header prepended to every AICPU kernel task argument blob.
struct AicpuParamHead
{
    uint32_t length;        // Total length: include custom message
    uint32_t ioAddrNum;     // Input and output address number
    uint32_t extInfoLength; // extInfo struct Length
    uint64_t extInfoAddr;   // extInfo address
};

// Kind of buffer-configuration message sent to AICPU (see AicpuConfigMsg.msgType).
enum class AicpuConfigMsgType {
    AICPU_CONFIG_MSG_TYPE_BUF_FREE = 0,     /* free buf */
    AICPU_CONFIG_MSG_TYPE_BUF_RESET = 1,    /* reset buf */
    AICPU_CONFIG_MSG_TYPE_BUF_SET_ADDR = 2, /* set buf addr to aicpu */
};

// Origin of an error-message record.
enum class AicpuErrMsgType {
    ERR_MSG_TYPE_NULL = 0,
    ERR_MSG_TYPE_AICORE = 1,
    ERR_MSG_TYPE_AICPU = 2,
};

// Buffer-configuration request; msgType takes AicpuConfigMsgType values.
typedef struct tagAicpuConfigMsg {
    uint8_t msgType;
    uint8_t reserved1;
    uint16_t bufLen;
    uint32_t offset;
    uint64_t bufAddr;
    uint32_t tsId;
    uint32_t reserved2;
} AicpuConfigMsg;

// AICORE error record; padded to a fixed 256-byte layout.
typedef struct tagAicoreErrMsgInfo {
    uint8_t errType;
    uint8_t version;
    uint8_t reserved1[2]; /* reserved1, 4 byte alignment */
    uint32_t errorCode;
    uint32_t modelId;
    uint32_t taskId;
    uint32_t streamId;
    uint64_t transactionId;
    uint8_t reserved2[228]; /* the total byte is 256, reserved2 len = 256 - other lens */
} AicoreErrMsgInfo;

// AICPU error record with operator name and description; fixed 256-byte layout.
typedef struct tagAicpuErrMsgInfo {
    uint8_t errType;
    uint8_t version;
    uint8_t reserved1[2]; /* reserved1, 4 byte alignment */
    uint32_t errorCode;
    uint32_t modelId;
    uint32_t streamId;
    uint64_t transactionId;
    char opName[64]; /* op name str */
    char errDesc[128]; /* err msg desc info */
    uint8_t reserved2[40]; /* the total byte is 256, reserved2 len = 256 - other lens */
} AicpuErrMsgInfo;
#pragma pack(pop)

} // namespace aicpu

#endif // AICPU_TASK_STRUCT_H_


+ 0
- 52
third_party/fwkacllib/inc/common/type_def.h View File

@@ -1,52 +0,0 @@
/**
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* Description:interface.
* Create: 2021-12-21
*/
#ifndef AICPU_TYPE_DEF_H
#define AICPU_TYPE_DEF_H

#include <cstdint>
#include <cstddef>

// Fallback aliases; guarded so an earlier platform definition takes precedence.
#ifndef char_t
typedef char char_t;
#endif

#ifndef float32_t
typedef float float32_t;
#endif

#ifndef float64_t
typedef double float64_t;
#endif

/**
 * Converts a pointer to its 64-bit integer address representation.
 * @param ptr pointer to convert (null is permitted)
 * @return integer value of the address; round-trips with ValueToPtr()
 */
inline uint64_t PtrToValue(const void *ptr)
{
    // Fix: the cast targets were const-qualified (static_cast<const uint64_t>),
    // which is meaningless on a prvalue result.
    return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ptr));
}

/**
 * Converts an integer produced by PtrToValue() back to a pointer.
 * @param value integer address representation
 * @return the reconstructed pointer
 */
inline void *ValueToPtr(const uint64_t value)
{
    return reinterpret_cast<void *>(static_cast<uintptr_t>(value));
}

/**
 * Reinterprets a pointer of type TI as a pointer of type TO.
 * No alignment or aliasing check is performed; the caller guarantees validity.
 */
template<typename TI, typename TO>
inline TO *PtrToPtr(TI *ptr)
{
    return reinterpret_cast<TO *>(ptr);
}

/**
 * Bounds-checked element addressing: returns &ptr[idx] when ptr is non-null
 * and idx < maxIdx, otherwise nullptr.
 * @param ptr base of an array with at least maxIdx valid elements
 * @param maxIdx number of valid elements
 * @param idx element index to address
 */
template<typename T>
inline T *PtrAdd(T * const ptr, const size_t maxIdx, const size_t idx)
{
    if ((ptr != nullptr) && (idx < maxIdx)) {
        // Fix: plain pointer arithmetic already has type T *; the original
        // wrapped `ptr + idx` in a redundant reinterpret_cast<T *>.
        return ptr + idx;
    }
    return nullptr;
}
#endif // AICPU_TYPE_DEF_H

+ 14
- 0
third_party/fwkacllib/inc/hccl/base.h View File

@@ -197,6 +197,20 @@ typedef struct tagCommAttr {
WorkMode mode; // 通信域内的probe工作模式
uint32_t deviceId = 0;
} CommAttr;

/* Opaque handles for HCCL point-to-point messaging (MPI-like semantics). */
typedef void* HcclMessage;
typedef void* HcclRequest;

/* Status of a received/probed message, modeled on MPI_Status. */
typedef struct {
  int srcRank;   /* rank_id of the sender of the received/probed message/envelope; MPI-standard field, callers may access */
  int tag;       /* tag of the received/probed message/envelope; MPI-standard field, callers may access */
  int error;     /* receive/probe error code: 0 = no error, others = transfer failure; MPI-standard field, callers may access */
  int cancelled; /* implementation-specific; callers are advised not to access */
  int count;     /* payload size of the received/probed message; implementation-specific, callers are advised not to access */
} HcclStatus;

/* Null request handle, analogous to MPI_REQUEST_NULL. */
#define HCCL_REQUEST_NULL NULL

#ifdef __cplusplus
}
#endif // __cplusplus


+ 36
- 1
third_party/fwkacllib/inc/ops/array_ops.h View File

@@ -501,6 +501,7 @@ REG_OP(Constant)
*@brief Creates a file constant tensor. The operator is used to process very large weights which are stored in a file. \n

*@par Attributes:
*file_path: A string, used to record file path. \n
*file_id: A string, used to record file id. \n
*shape: data shape. \n
*dtype: data type. \n
@@ -511,7 +512,8 @@ REG_OP(Constant)
REG_OP(FileConstant)
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
DT_UINT8, DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}))
.REQUIRED_ATTR(file_id, String)
.ATTR(file_path, String, "")
.ATTR(file_id, String, "")
.REQUIRED_ATTR(shape, ListInt)
.REQUIRED_ATTR(dtype, Type)
.OP_END_FACTORY_REG(FileConstant)
@@ -1205,6 +1207,39 @@ REG_OP(Copy)
.REQUIRED_ATTR(N, Int)
.OP_END_FACTORY_REG(Copy);

/**
*@brief Copy the src tensor to the dst tensor according to the specified size/stride/storage_offset parameters . \n

*@par Inputs:
*Eight inputs, including:
*dst: A tensor. Must be one of the following types:
* double, float32, float16, int8, uint8, int16, uint16, int32, uint32, int64, uint64, bool
*dst_size: A tensor with type int32
*dst_stride: A tensor with type int32
*dst_storage_offset: A tensor with type int32
*src: A tensor. Must be one of the following types:
* double, float32, float16, int8, uint8, int16, uint16, int32, uint32, int64, uint64, bool
*src_size: A tensor with type int32
*src_stride: A tensor with type int32
*src_storage_offset: the storage_offset of src tensor . \n

*@par Outputs:
*dst: An ref tensor.Must be one of the following types:
* double, float32, float16, int8, uint8, int16, uint16, int32, uint32, int64, uint64, bool . \n
*/

REG_OP(ViewCopy)
.INPUT(dst, TensorType::BasicType())
.INPUT(dst_size, TensorType::IndexNumberType())
.INPUT(dst_stride, TensorType::IndexNumberType())
.INPUT(dst_storage_offset, TensorType::IndexNumberType())
.INPUT(src, TensorType::BasicType())
.INPUT(src_size, TensorType::IndexNumberType())
.INPUT(src_stride, TensorType::IndexNumberType())
.INPUT(src_storage_offset, TensorType::IndexNumberType())
.OUTPUT(dst, TensorType::BasicType())
.OP_END_FACTORY_REG(ViewCopy)

/**
*@brief Generates fingerprint values. \n



+ 1
- 1
third_party/fwkacllib/inc/ops/avg_pool_1d_ops.h View File

@@ -28,7 +28,7 @@ namespace ge {

*@par Inputs:
* @li x: A tensor. Must be one of the following types:uint8, int8,int16, int32,
int64, float16, float, double.The format must be NHWC NCHW NC1HWC0.
int64, float16, float, double.The format must be NHWC/NCHW.

*@par Attributes:
*@li ksize: Kernel size. Input type is int.


+ 46
- 0
third_party/fwkacllib/inc/ops/data_flow_ops.h View File

@@ -2261,6 +2261,33 @@ REG_OP(OutfeedEnqueueOp)
.ATTR(channel_name, String, "")
.OP_END_FACTORY_REG(OutfeedEnqueueOp)

/**
*@brief Enqueue a Tensor on the computation outfeed. \n

*@par Inputs:
*Inputs include:
*x: A Tensor. Must be one of the following types: float16, float32,
float64, int8, int16, uint16, uint8, int32, int64, uint32, uint64,
bool, double, string. It's a dynamic input. \n
*tensor_name: A Tensor. Must be string types. \n

*@par Attributes:
*channel_name: name of operator channel, default "". \n

*@attention Constraints:
*The implementation for OutfeedEnqueueOpV2 on Ascend uses AICPU, with bad performance.

*@par Third-party framework compatibility
*@li compatible with tensorflow OutfeedEnqueueOpV2 operator.
*/
REG_OP(OutfeedEnqueueOpV2)
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8,
DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_UINT32,
DT_UINT64, DT_BOOL, DT_DOUBLE, DT_STRING}))
.INPUT(tensor_name, TensorType({DT_STRING}))
.ATTR(channel_name, String, "")
.OP_END_FACTORY_REG(OutfeedEnqueueOpV2)

/**
*@brief LruCache, create cache resource.
*@par Inputs:
@@ -2478,5 +2505,24 @@ REG_OP(GetNextFromQueue)
.ATTR(output_types, ListType, {})
.ATTR(output_shapes, ListListInt, {{}, {}})
.OP_END_FACTORY_REG(GetNextFromQueue)

/**
* @brief OptionalGetValue
* @par Inputs:
* optional: A tensor of type variant
* @par Outputs:
* components: A list of Tensor objects of output_types
* @par Attributes:
* output_types: types of all outputs
* output_shapes: shapes of all outputs
* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(OptionalGetValue)
.INPUT(optional, TensorType({DT_VARIANT}))
.DYNAMIC_OUTPUT(components, TensorType::BasicType())
.REQUIRED_ATTR(output_types, ListType)
.REQUIRED_ATTR(output_shapes, ListListInt)
.OP_END_FACTORY_REG(OptionalGetValue)
} // namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_DATA_FLOW_OPS_H_

+ 101
- 10
third_party/fwkacllib/inc/ops/deep_md.h View File

@@ -54,8 +54,6 @@ REG_OP(TabulateFusion)
.INPUT(em, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(descriptor, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.REQUIRED_ATTR(last_layer_size, Int)
.ATTR(split_count, Int, 1)
.ATTR(split_index, Int, 0)
.OP_END_FACTORY_REG(TabulateFusion)
/**
@@ -102,9 +100,105 @@ REG_OP(ProdEnvMatA)
.ATTR(rcut_r_smth, Float, 1.0)
.ATTR(sel_a, ListInt, {})
.ATTR(sel_r, ListInt, {})
.ATTR(split_count, Int, 1)
.ATTR(split_index, Int, 0)
.OP_END_FACTORY_REG(ProdEnvMatA)
/**
* @brief Calculate ProdEnvMatACalRij.
* Use type, natoms, sel_a, and rcut_r as constraints, find the central element in
* the corresponding coord through mesh, output the index of the central element
* and the distance between the central element and each neighbor. \n
*
* @par Inputs:
* @li coord: A Tensor. Must be one of the following types: float32, float64.
* @li type: A Tensor. Must be one of the following types: int32.
* @li natoms: A Tensor. Must be one of the following types: int32.
* @li box: A Tensor. Must be one of the following types: float32, float64.
* @li mesh: A Tensor. Must be one of the following types: int32.
*
* @par Outputs:
* rij: A Tensor. Must be one of the following types: float32, float64.
* nlist: A Tensor. Must be one of the following types: int32.
* distance: A Tensor. Must be one of the following types: float32, float64.
* rij_x: A Tensor. Must be one of the following types: float32, float64.
* rij_y: A Tensor. Must be one of the following types: float32, float64.
* rij_z: A Tensor. Must be one of the following types: float32, float64. \n
*
* @par Attributes:
* @li rcut_a: A Float.
* @li rcut_r: A Float.
* @li rcut_r_smth: A Float.
* @li sel_a: A ListInt.
* @li sel_r: A ListInt. \n
*
* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(ProdEnvMatACalcRij)
.INPUT(coord, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(type, TensorType({DT_INT32}))
.INPUT(natoms, TensorType({DT_INT32}))
.INPUT(box, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(mesh, TensorType({DT_INT32}))
.OUTPUT(rij, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(nlist, TensorType({DT_INT32}))
.OUTPUT(distance, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(rij_x, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(rij_y, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(rij_z, TensorType({DT_FLOAT, DT_DOUBLE}))
.ATTR(rcut_a, Float, 1.0)
.ATTR(rcut_r, Float, 1.0)
.ATTR(rcut_r_smth, Float, 1.0)
.ATTR(sel_a, ListInt, {})
.ATTR(sel_r, ListInt, {})
.OP_END_FACTORY_REG(ProdEnvMatACalcRij)
/**
* @brief Calculate ProdEnvMatACalcDescrpt. \n
*
* @par Inputs:
* @li distance: A Tensor. Must be one of the following types: float32, float64.
* @li rij_x: A Tensor. Must be one of the following types: float32, float64.
* @li rij_y: A Tensor. Must be one of the following types: float32, float64.
* @li rij_z: A Tensor. Must be one of the following types: float32, float64.
* @li type: A Tensor. Must be one of the following types: int32.
* @li natoms: A Tensor. Must be one of the following types: int32.
* @li mesh: A Tensor. Must be one of the following types: int32.
* @li davg: A Tensor. Must be one of the following types: float32, float64.
* @li dstd: A Tensor. Must be one of the following types: float32, float64. \n
*
* @par Outputs:
* @li descrpt: A Tensor. Must be one of the following types: float32, float64.
* @li descrpt_deriv: A Tensor. Must be one of the following types: float32, float64. \n
*
* @par Attributes:
* @li rcut_a: A Float.
* @li rcut_r: A Float.
* @li rcut_r_smth: A Float.
* @li sel_a: A ListInt.
* @li sel_r: A ListInt. \n
*
* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(ProdEnvMatACalcDescrpt)
.INPUT(distance, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(rij_x, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(rij_y, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(rij_z, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(type, TensorType({DT_INT32}))
.INPUT(natoms, TensorType({DT_INT32}))
.INPUT(mesh, TensorType({DT_INT32}))
.INPUT(davg, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(dstd, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(descrpt, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(descrpt_deriv, TensorType({DT_FLOAT, DT_DOUBLE}))
.ATTR(rcut_a, Float, 1.0)
.ATTR(rcut_r, Float, 1.0)
.ATTR(rcut_r_smth, Float, 1.0)
.ATTR(sel_a, ListInt, {})
.ATTR(sel_r, ListInt, {})
.OP_END_FACTORY_REG(ProdEnvMatACalcDescrpt)
/**
* @brief Calculate ProdForceSeA. \n
*
@@ -134,8 +228,6 @@ REG_OP(ProdForceSeA)
.OUTPUT(atom_force, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.REQUIRED_ATTR(n_a_sel, Int)
.REQUIRED_ATTR(n_r_sel, Int)
.ATTR(split_count, Int, 1)
.ATTR(split_index, Int, 0)
.OP_END_FACTORY_REG(ProdForceSeA)
/**
@@ -171,8 +263,6 @@ REG_OP(ProdVirialSeA)
.OUTPUT(atom_virial, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.REQUIRED_ATTR(n_a_sel, Int)
.REQUIRED_ATTR(n_r_sel, Int)
.ATTR(split_count, Int, 1)
.ATTR(split_index, Int, 0)
.OP_END_FACTORY_REG(ProdVirialSeA)
/**
@@ -195,6 +285,9 @@ REG_OP(ProdVirialSeA)
* Two attributes, including:
* @li split_count: A Scalar.
* @li split_index: A Scalar. \n
*
* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(TabulateFusionGrad)
.INPUT(table, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
@@ -205,8 +298,6 @@ REG_OP(TabulateFusionGrad)
.INPUT(descriptor, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(dy_dem_x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(dy_dem, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.ATTR(split_count, Int, 1)
.ATTR(split_index, Int, 0)
.OP_END_FACTORY_REG(TabulateFusionGrad)
} // namespace ge


+ 175
- 13
third_party/fwkacllib/inc/ops/elewise_calculation_ops.h View File

@@ -286,7 +286,7 @@ REG_OP(Minimum)
*@par Inputs:
*One inputs, include:
*x:A Tensor of type float16, float32, int32, int64, double,
* complex64, complex128.the format can be [NCHW,NC1HWC0,NHWC,ND,NCHW,NC1HWC0,NHWC,ND]
* complex64, complex128.the format can be [NCHW,NHWC,ND]

*@par Outputs:
*y:A Tensor with same type as "x". \n
@@ -418,7 +418,7 @@ REG_OP(SquaredDifference)

*@par Inputs:
*x: A Tensor of type float16, float32, double, complex64, complex128.
* the format can be [NCHW,NC1HWC0,NHWC,ND]
* the format can be [NCHW,NHWC,ND]

*@par Outputs:
*y: A Tensor of the same type as "x". \n
@@ -439,7 +439,7 @@ REG_OP(Cos)
* Two inputs, including:
*@li x1: A Tensor. Must be one of the following types:
* float16, float32, int32, int8, uint8, float64, int64, uint16, int16,
* complex64, complex128, the format can be [NCHW,NC1HWC0,NHWC,ND].
* complex64, complex128, the format can be [NCHW,NHWC,ND].
*@li x2: A Tensor. Has the same type and format as input "x1". \n

*@par Outputs:
@@ -468,7 +468,7 @@ REG_OP(Div)
*@li x1: A Tensor. Must be one of the following types:
* float16, float32, int32, int8, uint8, double, int16, int64, complex64,
* complex128, quint8, qint8, qint32, string, bool. the format can be
* [NCHW, NC1HWC0, NHWC, ND]
* [NCHW, NHWC, ND]
*@li x2: A Tensor of the same type and format as "x1". \n

*@par Outputs:
@@ -533,6 +533,24 @@ REG_OP(Expm1)
.OUTPUT(y, TensorType::UnaryDataType())
.OP_END_FACTORY_REG(Expm1)

/**
* @brief Computes the expint(x). \n

* @par Inputs:
* One input:
* x: A Tensor. Must be one of the following types: bfloat16, half, float32, double. \n

* @par Outputs:
* y: A Tensor of the same type as "x". \n

* @par Third-party framework compatibility
* Compatible with TensorFlow operator Expint.
*/
REG_OP(Expint)
.INPUT(x, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(Expint)

/**
*@brief: Computes the reciprocal of "x". \n

@@ -665,6 +683,24 @@ REG_OP(NotEqual)
.OUTPUT(y, TensorType({DT_BOOL}))
.OP_END_FACTORY_REG(NotEqual)

/**
* @brief Computes ndtri element-wise (y = sqrt(2) * erfinv(2 * x - 1))

* @par Inputs:
* One input:
* x: A Tensor. Must be one of the following types: bfloat16, float16, float32, double \n

* @par Outputs:
* y: A Tensor. Has the same type and format as input "x". \n

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator Ndtri.
*/
REG_OP(Ndtri)
.INPUT(x, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(Ndtri)

/**
*@brief Computes numerical negative value element-wise (y = -x)

@@ -740,6 +776,31 @@ REG_OP(Xdivy)
DT_COMPLEX128}))
.OP_END_FACTORY_REG(Xdivy)

/**
* @brief Computes "x" multiplied by the natural logarithm of (1 + y) element-wise,
* if "x" == 0, return "0". \n

* @par Inputs:
* Two inputs, including:
* @li x: A Tensor. Must be one of the following types: float16, float32,
* double, complex64, complex128.
* @li y: A Tensor. Has the same type as "x". \n

* @par Outputs:
* z: A Tensor. Has the same type as "x". \n

* @par Third-party framework compatibility
* Compatible with TensorFlow operator Xlog1py.
*/
REG_OP(Xlog1py)
.INPUT(x, TensorType({DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64,
DT_COMPLEX128}))
.INPUT(y, TensorType({DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64,
DT_COMPLEX128}))
.OUTPUT(z, TensorType({DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64,
DT_COMPLEX128}))
.OP_END_FACTORY_REG(Xlog1py)

/**
*@brief Computes "x" multiplied by the logarithm of y element-wise,
* if "x" == 0, return "0". \n
@@ -970,6 +1031,25 @@ REG_OP(LogicalOr)
.OUTPUT(y, TensorType({DT_BOOL}))
.OP_END_FACTORY_REG(LogicalOr)

/**
* @brief Computes spence of x element-wise. \n

*
* @par Inputs:
* x: A tensor. Must be one of the following types: bfloat16, float16, float32, double.
*
* @par Outputs:
* y: A tensor. Has the same type as "x".
*
* @par Third-party framework compatibility
* Compatible with the TensorFlow operator Spence.
*
*/
REG_OP(Spence)
.INPUT(x, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(Spence)

/**
*@brief Returns the truth value of x1 AND x2 element-wise. \n

@@ -1176,6 +1256,31 @@ REG_OP(FusedMulAdd)
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.OP_END_FACTORY_REG(FusedMulAdd)

/**
*@brief Confuse mul+add+add with broadcast. \n

*@par Inputs:
*Four inputs, including:
* @li x1: A Tensor. Must be one of the following types:int32, float16, float32.
* @li x2: A Tensor of the same type as "x1".
* @li x3: A Tensor of the same type as "x1".
* @li x4: A Tensor of the same type as "x1". \n

*@par Outputs:
* y: A Tensor. Has the same type as "x1". \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/

REG_OP(FusedMulAddAdd)
.INPUT(x1, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.INPUT(x2, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.INPUT(x3, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.INPUT(x4, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.OP_END_FACTORY_REG(FusedMulAddAdd)
/**
*@brief Returns x1 + x2 element-wise. \n

@@ -1299,7 +1404,7 @@ REG_OP(AssignSub)

*@par Inputs:
* Two inputs, including:
*@li y: An NCHW, NC1HWC0, NHWC, ND Tensor. Must be one of the following types: \
*@li y: An NCHW, NHWC, ND Tensor. Must be one of the following types: \
* float, int32, int8, double, complex64, complex128, half.
*@li dy: A Tensor of the same type and format as "y". \n

@@ -1321,11 +1426,11 @@ REG_OP(RsqrtGrad)
*@brief Computes hyperbolic sine of "x" element-wise. \n

*@par Inputs:
*x: An NCHW, NC1HWC0, NHWC,or ND Tensor of type float, double, complex64,
*x: An NCHW, NHWC,or ND Tensor of type float, double, complex64,
* complex128, half. \n

*@par Outputs:
*y: A NCHW, NC1HWC0, NHWC,or ND Tensor of type float, double, complex64,
*y: A NCHW, NHWC,or ND Tensor of type float, double, complex64,
* complex128, half. \n

*@par Third-party framework compatibility
@@ -1365,7 +1470,7 @@ REG_OP(ClipByValue)

*@par Inputs:
*x: A Tensor of type float16, float32, double, complex64, complex128.
* the format can be [NCHW,NC1HWC0,NHWC,ND]. \n
* the format can be [NCHW,NHWC,ND]. \n

*@par Outputs:
*y: A Tensor. Has the same type as "x". \n
@@ -1385,7 +1490,7 @@ REG_OP(Cosh)
*@par Inputs:
* Two inputs, including:
*@li x1: A Tensor. Must be one of the following types:float16, float32, int32,
* int8, uint8, double, the format can be [NCHW,NC1HWC0,NHWC,ND].
* int8, uint8, double, the format can be [NCHW,NHWC,ND].
*@li x2: A Tensor of the same type as "x1". \n

*@par Outputs:
@@ -1410,7 +1515,7 @@ REG_OP(DivNoNan)
* One input: \n
*x: A Tensor, Must be one of the following types:
* int32, uint8, int16, int8, int64, int64, uint16, uint32, uint64,
* and format can be [NCHW,NC1HWC0,NHWC,ND]
* and format can be [NCHW,NHWC,ND]

*@par Outputs:
*y: A Tensor. Has the same type and format as "x"
@@ -1662,6 +1767,44 @@ REG_OP(Atan2)
.OUTPUT(y, TensorType::FloatingDataType())
.OP_END_FACTORY_REG(Atan2)

/**
* @brief Computes the Fresnel cosine integral (fresnel_cos) of x element-wise. \n
*
* @par Inputs:
* x: A tensor. Must be one of the following types: bfloat16, float16, float32, double.
*
* @par Outputs:
* y: A tensor. Has the same type as "x".
*
* @par Third-party framework compatibility
* Compatible with the TensorFlow operator FresnelCos.
*
*/
REG_OP(FresnelCos)
.INPUT(x, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(FresnelCos)

/**
* @brief Computes the Fresnel sine integral (fresnel_sin) of x element-wise. \n
*
* @par Inputs:
* x: A tensor. Must be one of the following types: bfloat16, float16, float32, double.
*
* @par Outputs:
* y: A tensor. Has the same type as "x".
*
* @par Third-party framework compatibility
* Compatible with the TensorFlow operator FresnelSin.
*
*/
REG_OP(FresnelSin)
.INPUT(x, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(FresnelSin)

/**
*@brief Returns the truth value of abs(x1-x2) < tolerance element-wise. \n

@@ -1978,7 +2121,7 @@ REG_OP(BitwiseOr)
*@par Inputs:
*Two inputs, including:
*@li x1: A Tensor. Must be one of the following types: int8, int16, int32, int64, uint8, uint16, uint32, uint64.
* The format is NC1HWC0 or ND. Broadcasting is supported.
* The format is ND. Broadcasting is supported.
*@li x2: A Tensor. Has the same type and format as "x1". \n

*@par Outputs:
@@ -3463,12 +3606,12 @@ REG_OP(Addcmul)
REG_OP(AxpyV2)
.INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(alpha, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(alpha, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OP_END_FACTORY_REG(AxpyV2)

/**
* @brief Add the partial values of two tensors in format NC1HWC0.
* @brief Add the partial values of two tensors.

* @par Inputs:
* @li x1: A Tensor in 5HD, and must be one of the following types: float16,
@@ -3852,6 +3995,25 @@ REG_OP(ApplyAdamV2)
.OUTPUT(v, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.ATTR(adam_mode, String, "adam")
.OP_END_FACTORY_REG(ApplyAdamV2)

/**
* @brief Computes Dawson's integral (Dawsn) of x element-wise. \n
*
* @par Inputs:
* x: A tensor. Must be one of the following types: bfloat16, float16, float32, double.
*
* @par Outputs:
* y: A tensor. Has the same type as "x".
*
* @par Third-party framework compatibility
* Compatible with the TensorFlow operator Dawsn.
*
*/
REG_OP(Dawsn)
.INPUT(x, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_BF16, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(Dawsn)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_ELEWISE_CALCULATION_OPS_H_

+ 42
- 14
third_party/fwkacllib/inc/ops/image_ops.h View File

@@ -671,6 +671,7 @@ size for the images . \n
output tensors are aligned, preserving the values at the corner pixels.
Defaults to false .
* @li half_pixel_centers: An optional bool. Defaults to False . \n
* @li dtype: An optional Type attribute, supported types are [DT_FLOAT, DT_UINT8]. Defaults to DT_FLOAT . \n
*@par Outputs:
*y: 4-D with shape [batch, new_height, new_width, channels] . \n

@@ -682,12 +683,13 @@ Defaults to false .
*/

REG_OP(ResizeBilinearV2)
.INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32,
DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(size, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT}))
.OUTPUT(y, TensorType({DT_UINT8, DT_FLOAT}))
.ATTR(align_corners, Bool, false)
.ATTR(half_pixel_centers, Bool, false)
.ATTR(dtype, Type, DT_FLOAT)
.OP_END_FACTORY_REG(ResizeBilinearV2)

/**
@@ -1267,7 +1269,7 @@ REG_OP(DecodeAndCropJpeg)

*@par Inputs:
* One input:
*x: An NC1HWC0 Tensor.
*x: A Tensor.
* Must be one of the following types: float16, float32 . \n

*@par Attributes:
@@ -1304,7 +1306,7 @@ REG_OP(ResizeBilinearV2D)

*@par Inputs:
* One input:
*images: An NC1HWC0 Tensor.
*images: A Tensor.
* Must be one of the following types: float16, float32 . \n

*@par Attributes:
@@ -1338,7 +1340,7 @@ REG_OP(KeepRatioResizeBilinear)

*@par Inputs:
* One input:
*x: An NC1HWC0 Tensor.
*x: A Tensor.
* Must be one of the following types: float16, float32, int32, int8, uint8

*@par Attributes:
@@ -1737,17 +1739,17 @@ round_prefer_ceil, floor, ceil. Only used by nearest interpolation.
*/

REG_OP(Resize)
.INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32,
DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(roi, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(scales, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(sizes, TensorType({DT_INT64}))
.OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32,
DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(x, TensorType({DT_INT8,DT_UINT8,DT_INT16,DT_UINT16,DT_INT32,
DT_INT64,DT_FLOAT16,DT_FLOAT,DT_DOUBLE}))
.OPTIONAL_INPUT(roi, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE}))
.OPTIONAL_INPUT(scales, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(sizes, TensorType({DT_INT64,DT_INT32}))
.OUTPUT(y, TensorType({DT_INT8,DT_UINT8,DT_INT16,DT_UINT16,DT_INT32,
DT_INT64,DT_FLOAT16,DT_FLOAT,DT_DOUBLE}))
.ATTR(coordinate_transformation_mode, String, "half_pixel")
.ATTR(cubic_coeff_a, Float, -0.75)
.ATTR(exclude_outside, Int, 0)
.ATTR(extrapolation_value, Float, 0)
.ATTR(extrapolation_value, Float, 0.0)
.ATTR(mode, String, "nearest")
.ATTR(nearest_mode, String, "round_prefer_floor")
.OP_END_FACTORY_REG(Resize)
@@ -2309,6 +2311,32 @@ REG_OP(UpsampleNearest1dGrad)
.ATTR(scales, ListFloat, {})
.OP_END_FACTORY_REG(UpsampleNearest1dGrad)

/**
* @brief Decodes a JPEG, GIF, PNG or BMP-encoded image held in a string tensor
into a numeric tensor. \n

* @par Inputs:
* contents: A Tensor of type string. 0-D. The JPEG, GIF, PNG, BMP-encoded image. \n

* @par Attributes:
* @li channels: An optional int. Defaults to 0. Number of color channels for the decoded image.
* @li dtype: An optional Type. Defaults to DT_UINT8. The desired data type of the decoded image.
* @li expand_animations: An optional bool. Defaults to true. Controls the shape of the returned op's output.
If 'true', the returned op will
produce a 4-D tensor for GIF files. If 'false', the returned op will produce a 3-D tensor for GIF files.

* @par Outputs:
* image: A Tensor dtype of uint8, uint16 or float.

* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DecodeImage)
.INPUT(contents, TensorType({DT_STRING}))
.OUTPUT(image, TensorType({DT_UINT8, DT_UINT16, DT_FLOAT}))
.ATTR(channels, Int, 0)
.ATTR(dtype, Type, DT_UINT8)
.ATTR(expand_animations, Bool, true)
.OP_END_FACTORY_REG(DecodeImage)

/**
* @brief JPEG encode input image with provided compression quality. \n



+ 4
- 4
third_party/fwkacllib/inc/ops/math_ops.h View File

@@ -425,7 +425,7 @@ REG_OP(EndOfSequence)

*@par Inputs:
*x: A Tensor of type float16, float32 or double. the format can be
* [NCHW,NC1HWC0,NHWC,ND]
* [NCHW,NHWC,ND]

*@par Outputs:
*y: A Tensor. Has the same type and format as "x" . \n
@@ -462,15 +462,15 @@ REG_OP(Erfc)

*@par Inputs:
*Three inputs, including:
*@li x: A Tensor of type float32, float16, int32, int64.
*@li range: A Tensor of type float32,float16,int32, int64.
*@li x: A Tensor of type float32, int32, int64. float16 is currently not supported.
*@li range: A Tensor of type float32, int32, int64. float16 is currently not supported.
*@li nbins: A Tensor of type int32 . \n

*@par Attributes:
* dtype: An optional attribute. Defaults to "int32" . \n

*@par Outputs:
*y: A Tensor. A Tensor of type int32 or int64 . \n
*y: A Tensor. A Tensor of type int32. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow operator HistogramFixedWidth.


+ 144
- 27
third_party/fwkacllib/inc/ops/matrix_calculation_ops.h View File

@@ -24,6 +24,57 @@
#include "graph/operator_reg.h"

namespace ge {
/**
* @brief Fused LayerNorm followed by three projection MatMuls (query/key/value):
*               / (MatMul -> ConfusionTransposeD)
*   LayerNorm - (MatMul -> ConfusionTransposeD)
*               \ (MatMul -> ConfusionTransposeD). \n

* @par Inputs:
* Nine inputs (the three bias inputs are optional), including:
* @li x: A Tensor. Must be one of the following types: float16.
* @li kernel_query: A Tensor. Must be one of the following types: float16.
* @li kernel_key: A Tensor. Must be one of the following types: float16.
* @li kernel_value: A Tensor. Must be one of the following types: float16.
* @li gamma: A Tensor. Must be one of the following types: float16.
* @li beta: A Tensor. Must be one of the following types: float16.
* @li bias_query: An optional Tensor. Must be one of the following types: float16.
* @li bias_key: An optional Tensor. Must be one of the following types: float16.
* @li bias_value: An optional Tensor. Must be one of the following types: float16. \n

* @par Attributes:
* @li epsilon: An optional attribute, the type is float32. Defaults to 1e-7.
* @li trans_a: An optional attribute, the type is bool. Defaults to False.
* @li trans_b: An optional attribute, the type is bool. Defaults to False. \n

* @par Outputs:
* Six outputs, including:
* @li norm: A Tensor. Must be one of the following types: float16.
* @li query_output: A Tensor. Must be one of the following types: float16.
* @li key_output: A Tensor. Must be one of the following types: float16.
* @li value_output: A Tensor. Must be one of the following types: float16.
* @li mean: A Tensor. Must be one of the following types: float16.
* @li variance: A Tensor. Must be one of the following types: float16. \n
*/
REG_OP(AttentionLnQKV)
.INPUT(x, TensorType({DT_FLOAT16}))
.INPUT(kernel_query, TensorType({DT_FLOAT16}))
.INPUT(kernel_key, TensorType({DT_FLOAT16}))
.INPUT(kernel_value, TensorType({DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT16}))
.INPUT(beta, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(bias_query, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(bias_key, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(bias_value, TensorType({DT_FLOAT16}))
.OUTPUT(norm, TensorType({DT_FLOAT16}))
.OUTPUT(query_output, TensorType({DT_FLOAT16}))
.OUTPUT(key_output, TensorType({DT_FLOAT16}))
.OUTPUT(value_output, TensorType({DT_FLOAT16}))
.OUTPUT(mean, TensorType({DT_FLOAT16}))
.OUTPUT(variance, TensorType({DT_FLOAT16}))
.ATTR(epsilon, Float, 0.0000001)
.ATTR(trans_a, Bool, false)
.ATTR(trans_b, Bool, false)
.OP_END_FACTORY_REG(AttentionLnQKV)

/**
*@brief Multiplies matrix "a" by matrix "b", producing "a * b" . \n
@@ -31,9 +82,9 @@ namespace ge {
*@par Inputs:
*Three inputs, including:
* @li x1: A matrix Tensor. 2D. Must be one of the following types: float16,
* float32, int32. Has format [ND, NHWC, FRACTAL_NZ].
* float32, int32. Has format [ND, NHWC].
* @li x2: A matrix Tensor. 2D. Must be one of the following types: float16,
* float32, int32. Has format [ND, NHWC, FRACTAL_NZ].
* float32, int32. Has format [ND, NHWC].
* @li bias: A optional 1D Tensor. Must be one of the following types: float16,
* float32, int32. Has format [ND, NHWC] . \n

@@ -43,7 +94,7 @@ namespace ge {

*@par Outputs:
*y: The result matrix Tensor. 2D. Must be one of the following types: float16,
* float32, int32. Has format [ND, NHWC, FRACTAL_NZ] . \n
* float32, int32. Has format [ND, NHWC] . \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator BatchMatmul.
@@ -63,9 +114,9 @@ REG_OP(MatMul)
*@par Inputs:
*Four inputs, including:
* @li x1: A matrix Tensor. 2D. Must be one of the following types: float32,
float16, int32, int8. Has format [ND, NHWC, FRACTAL_NZ].
float16, int32, int8. Has format [ND, NHWC].
* @li x2: A matrix Tensor. 2D. Must be one of the following types: float32,
float16, int32, int8. Has format [ND, NHWC, FRACTAL_NZ].
float16, int32, int8. Has format [ND, NHWC].
* @li bias: A 1D Tensor. Must be one of the following types: float32,
float16, int32. Has format [ND, NHWC].
* @li offset_w: A Optional 1D Tensor for quantized inference. Type is int8.
@@ -82,7 +133,11 @@ REG_OP(MatMul)

*@par Outputs:
*y: The result matrix Tensor. 2D. Must be one of the following types: float32,
float16, int32. Has format [ND, NHWC, FRACTAL_NZ]. \n
float16, int32. Has format [ND, NHWC]. \n

*@attention Constraints:
* If performance is better in format NZ, please disable
"MatmulTransdataFusionPass" in the fusion configuration. \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator BatchMatmul.
@@ -124,6 +179,10 @@ REG_OP(MatMulV2)
*y: The result matrix Tensor. 2D. Must be one of the following types: int32,
* float16. \n

*@attention Constraints:
* If performance is better in format NZ, please disable
"MatmulTransdataFusionPass" in the fusion configuration.

*/
REG_OP(MatMulV2Compress)
.INPUT(x1, TensorType({DT_INT8}))
@@ -138,7 +197,7 @@ REG_OP(MatMulV2Compress)
.OP_END_FACTORY_REG(MatMulV2Compress)

/**
*@brief Performs Matrix-to-matrix Multiply, producing c=alpha[0]*a*b+beta[0]*c . \n
*@brief Performs Matrix-to-matrix Multiply, producing y=alpha[0]*a*b+beta[0]*c . \n

*@attention Constraints:
* For better performance, The k-axis must be aligned to 16 (input type
@@ -147,24 +206,24 @@ REG_OP(MatMulV2Compress)
*@par Inputs:
*Five inputs, including:
*@li a: A matrix Tensor. Must be one of the following types: float16, int8.
* Has format [ND, FRACTAL_NZ]. 2D(ND) or 4D(FRACTAL_NZ).
* Has format [ND].
*@li b: A matrix Tensor. Must be one of the following types: float16, int8.
* Has format [ND, FRACTAL_NZ, FRACTAL_Z]. 2D(ND) or 4D(FRACTAL_NZ, FRACTAL_Z).
* Has format ND.
*@li c: A matrix Tensor. Must be one of the following types: float16, int32,
* float32. has format [ND, FRACTAL_NZ]. 2D(ND) or 4D(FRACTAL_NZ).
* float32. has format ND.
*@li alpha: A 1D Tensor. The shape of alpha is [1].Must be one of the following
* types: float16, int32, float32. Has format [ND].
*@li beta: A 1D Tensor. The shape of beta is [1]. Must be one of the following
* types: float16, int32, float32. Has format [ND].
* The format of a, b, c has restriction:\n
* When type of a is int8 and type of c is int32, the format of a, b, c should
* all be ND, or a is FRACTAL_NZ and b is FRACTAL_Z and c is ND.\n
* all be ND.\n
* When type of a is int8 and type of c is float32, the format of a, b, c should
* all be ND or a is FRACTAL_NZ and b is FRACTAL_Z and c is FRACTAL_NZ.\n
* all be ND.\n
* When type of a is float16 and type of c is float16, the format of a, b, c
* should all be ND or FRACTAL_NZ.\n
* should all be ND.\n
* When type of a is float16 and type of c is float32, the format of a, b, c
* should all be ND or FRACTAL_NZ . \n
* should all be ND. \n

*@par Attributes:
*Two attributes, including:
@@ -175,8 +234,7 @@ REG_OP(MatMulV2Compress)

*@par Outputs:
*y: The result matrix Tensor. Must be one of the following types: float16,
* float32, int32. Has format [ND, FRACTAL_NZ], the format should be equal to a.
* 2D(ND) or 4D(FRACTAL_NZ).
* float32, int32. Has format [ND], the format should be equal to a.
*/

REG_OP(GEMM)
@@ -196,9 +254,9 @@ REG_OP(GEMM)
*@par Inputs:
*Two inputs, including:
* @li x1: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ].
* float32, int32. 2D or higher. Has format [ND, NHWC].
* @li x2: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ] . \n
* float32, int32. 2D or higher. Has format [ND, NHWC] . \n

*@par Attributes:
*@li adj_x1: A bool. If True, changes the shape of "x1" from [B, M, K] to [B, K, M].
@@ -206,7 +264,7 @@ REG_OP(GEMM)

*@par Outputs:
*y: The result matrix Tensor. 2D or higher. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ]. Has the same shape length as "x1" and "x2" . \n
* float32, int32. 2D or higher. Has format [ND, NHWC]. Has the same shape length as "x1" and "x2" . \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator BatchMatmul.
@@ -227,11 +285,11 @@ REG_OP(BatchMatMul)
* @par Inputs:
* Three inputs, including:
* @li x1: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ].
* float32, int32. 2D or higher. Has format [ND, NHWC].
* @li x2: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ] . \n
* float32, int32. 2D or higher. Has format [ND, NHWC] . \n
* @li bias: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ] . \n
* float32, int32. 2D or higher. Has format [ND, NHWC] . \n

* @par Attributes:
* @li adj_x1: A bool. If True, changes the shape of "x1" from [B, M, K] to [B, K, M].
@@ -239,7 +297,11 @@ REG_OP(BatchMatMul)

* @par Outputs:
* y: The result matrix Tensor. 2D or higher. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ]. Has the same shape length as "x1" and "x2" . \n
* float32, int32. 2D or higher. Has format [ND, NHWC]. Has the same shape length as "x1" and "x2" . \n

*@attention Constraints:
* If performance is better in format NZ, please disable
"MatmulTransdataFusionPass" in the fusion configuration. \n

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator BatchMatmul.
@@ -265,7 +327,12 @@ REG_OP(BatchMatMulV2)
* TensorType::FloatingDataType() . \n

*@par Outputs:
*y: A Tensor. Has the same type as "x".
*y: A Tensor. Has the same type as "x". \n

*@attention Constraints:
* If performance is better in format NZ, please disable
"MatmulTransdataFusionPass" in the fusion configuration. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator L2Loss.
*/
@@ -405,6 +472,56 @@ REG_OP(MatrixSetDiagD)
.OUTPUT(y, TensorType::BasicType())
.OP_END_FACTORY_REG(MatrixSetDiagD)

/**
* @brief Function AttentionScore: computes the attention score from "query", "key"
and "value". \n

* @par Inputs:
* Six inputs ("drop_mask" is optional), including:
* @li query: A matrix Tensor. The type only support float16.
* @li key: A matrix Tensor. The type only support float16.
* @li value: A matrix Tensor. The type only support float16.
* @li padding_mask: A matrix Tensor. The type only support float16.
* @li scale: A scalar. The type only support float16.
* @li drop_mask: An optional matrix Tensor. Registered here as int8.
NOTE(review): doc previously said uint8 — confirm the intended dtype. \n

* @par Attributes:
* @li keep_prob: An optional float. Defaults to 1.0.
* @li query_transpose: A bool. If True, changes the shape of "query" from [K, M] to
[M, K].
* @li key_transpose: A bool. If True, changes the shape of "key" from [N, K] to
[K, N].
* @li bmm_score_transpose_a: A bool. If True, changes the shape of "mid_data" from [K, M] to
[M, K].
* @li bmm_score_transpose_b: A bool. If True, changes the shape of "value" from [N, K] to
[K, N].
* @li softmax_axes: A list of int. The dimensions softmax would be performed on. Defaults
to "[-1]" . \n

* @par Outputs:
* attention_score: The result matrix Tensor. The type only support float16.
* softmax_output: The result matrix Tensor. The type only support float16.

* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(AttentionScore)
.INPUT(query, TensorType({DT_FLOAT16}))
.INPUT(key, TensorType({DT_FLOAT16}))
.INPUT(value, TensorType({DT_FLOAT16}))
.INPUT(padding_mask, TensorType({DT_FLOAT16}))
.INPUT(scale, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(drop_mask, TensorType({DT_INT8}))
.OUTPUT(attention_score, TensorType({DT_FLOAT16}))
.OUTPUT(softmax_output, TensorType({DT_FLOAT16}))
.ATTR(keep_prob, Float, 1.0)
.ATTR(query_transpose, Bool, false)
.ATTR(key_transpose, Bool, false)
.ATTR(bmm_score_transpose_a, Bool, false)
.ATTR(bmm_score_transpose_b, Bool, false)
.ATTR(softmax_axes, ListInt, {-1})
.OP_END_FACTORY_REG(AttentionScore)

/**
*@brief Applies sparse "updates" to individual values or slices in a Variable . \n

@@ -1211,7 +1328,7 @@ REG_OP(IndexAdd)
*qint8, quint8, qint32, uint16, complex128, uint32, uint64. \n

* @li x2: A Tensor of the same type as "x1".
* @li indices: A Tensor of the indices,
* @li indices: A Tensor of the indices,

* @par Attributes:
* @li accumulate: Does it support self accumulation.Defaults to 0.
@@ -1394,8 +1511,8 @@ REG_OP(Trace)

*/
REG_OP(Pinverse)
.INPUT(x, TensorType({ DT_FLOAT, DT_DOUBLE }))
.OUTPUT(y, TensorType({ DT_FLOAT, DT_DOUBLE }))
.INPUT(x, TensorType({ DT_FLOAT, DT_DOUBLE }))
.OUTPUT(y, TensorType({ DT_FLOAT, DT_DOUBLE }))
.ATTR(rcond, Float, 1e-15)
.OP_END_FACTORY_REG(Pinverse)



+ 60
- 58
third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h View File

@@ -86,35 +86,37 @@ REG_OP(L2NormalizeGrad)
*@brief Performs batch normalization . \n

*@par Inputs:
* Five inputs, including: (NHWC, NCHW, or NC1HWC0 supported)
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the scaling factor.
*@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the offset.
*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the mean used for inference. Must be "None" if the
* Five inputs, including: (NHWC, NCHW)
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW.
*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Specifies the scaling factor.
*@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Specifies the offset.
*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Specifies the mean used for inference. Must be "None" if the
operation is used for training.
*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be
5D if input "x" is with format NC1HWC0. Specifies the variance used for inference. Must be "None"
*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Specifies the variance used for inference. Must be "None"
if the operation is used for training . \n

*@par Attributes:
*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.0001".
*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero.
Defaults to "0.0001".
*@li data_format: An optional string, specifying the format of "x". Defaults to "NHWC".
*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True" . \n
*@li is_training: An optional bool, specifying if the operation is used for training or inference.
Defaults to "True" . \n

*@par Outputs:
* Five outputs, including: (NHWC, NCHW, or NC1HWC0 supported)
*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the mean of "x".
* Five outputs, including: (NHWC, NCHW)
*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", with format NHWC or NCHW.
*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Specifies the mean of "x".
*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x".
Specifies the variance of "x".
*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
*@li reserve_space_3: An optional Tensor of type float32. For compatibility with tensorflow, only has one useless element. \n
*@li reserve_space_3: An optional Tensor of type float32. For compatibility with tensorflow,
only has one useless element. \n

*@attention Constraints:
*@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available,
@@ -264,17 +266,17 @@ REG_OP(SyncBatchNormBackwardElemt)
*@brief Performs batch normalization . \n

*@par Inputs:
* Five inputs, including: (NHWC, NCHW, or NC1HWC0 supported)
*@li x: A 3D or 6D Tensor of type float16 or float32, with format NDHWC or NCDHW for 4D or NDC1HWC0 for 6D.
*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW. Must be 6D
if input "x" is with format NDC1HWC0. Specifies the scaling factor.
*@li offset: A Tensor of type float32. Must be 3D if input "x" is with format NDHWC or NCDHW. Must be 6D
if input "x" is with format NC1HWC0. Specifies the offset.
*@li mean: A Tensor of type float32. Must be 3D if input "x" is with format NDHWC or NCDHW. Must be 6D
if input "x" is with format NC1HWC0. Specifies the mean used for inference. Must be "None" if the
* Five inputs, including: (NHWC, NCHW)
*@li x: A 3D or 6D Tensor of type float16 or float32, with format NDHWC or NCDHW.
*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW.
Specifies the scaling factor.
*@li offset: A Tensor of type float32. Must be 3D if input "x" is with format NDHWC or NCDHW.
Specifies the offset.
*@li mean: A Tensor of type float32. Must be 3D if input "x" is with format NDHWC or NCDHW.
Specifies the mean used for inference. Must be "None" if the
operation is used for training.
*@li variance: A Tensor of type float32. Must be 3D if input "x" is with format NHWC or NCHW. Must be
5D if input "x" is with format NC1HWC0. Specifies the variance used for inference. Must be "None"
*@li variance: A Tensor of type float32. Must be 3D if input "x" is with format NHWC or NCHW.
Specifies the variance used for inference. Must be "None"
if the operation is used for training . \n

*@par Attributes:
@@ -283,16 +285,16 @@ if the operation is used for training . \n
*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True" . \n

*@par Outputs:
* Five outputs, including: (NHWC, NCHW, or NC1HWC0 supported)
*@li y: A 3D or 6D Tensor of type float16 or float32 for the normalized "x", with format NDHWC or NCDHW for 4D or NDC1HWC0 for 6D.
*@li batch_mean: A Tensor of type float32. Must be 3D if input "x" is with format NDHWC or NCDHW. Must be 6D
if input "x" is with format NDC1HWC0. Specifies the mean of "x".
* Five outputs, including: (NHWC, NCHW)
*@li y: A 3D or 6D Tensor of type float16 or float32 for the normalized "x", with format NDHWC or NCDHW.
*@li batch_mean: A Tensor of type float32. Must be 3D if input "x" is with format NDHWC or NCDHW.
Specifies the mean of "x".
*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW.
Must be 6D if input "x" is with format NDC1HWC0. Specifies the variance of "x".
Specifies the variance of "x".
*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW.
Must be 6D if input "x" is with format NDC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Must be 6D if input "x" is with format NDC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output . \n
Specifies the variance of "x" for gradient computation. Pass "None" to skip this output . \n

*@attention Constraints:
*@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available,
@@ -375,11 +377,11 @@ REG_OP(BatchNormExt2)

*@par Inputs:
* Five inputs, including:
*@li y_backprop: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0, for the gradient.
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0.
*@li scale: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0.
*@li reserve_space_1: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm.
*@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm .
*@li y_backprop: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, for the gradient.
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW.
*@li scale: A 4D or 5D Tensor of type float32, with format NHWC, NCHW.
*@li reserve_space_1: A 4D or 5D Tensor of type float32, with format NHWC, NCHW. It is an output of BatchNorm.
*@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, NCHW. It is an output of BatchNorm .
*@li reserve_space_3: A 1D optional Tensor of type float32. It is an output of BatchNorm . \n

*@par Attributes:
@@ -388,11 +390,11 @@ REG_OP(BatchNormExt2)
*@li is_training: An optional bool. Defaults to "true". Specifies the operation is for training (default) or inference . \n

*@par Outputs:
*@li x_backprop: A Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "x".
*@li scale_backprop: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "scale".
*@li *offset_backprop: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "offset".
*@li *reserve_space_4: A Tensor of type float32, with shape NHWC, NCHW, or NC1HWC0. Pass "None" to skip this output.
*@li *reserve_space_5: A Tensor of type float32, with shape NHWC, NCHW, or NC1HWC0. Pass "None" to skip this output . \n
*@li x_backprop: A Tensor of type float16 or float32, with format NHWC, NCHW, for the offset of "x".
*@li scale_backprop: A Tensor of type float32, with format NHWC, NCHW, for the offset of "scale".
*@li *offset_backprop: A Tensor of type float32, with format NHWC, NCHW, for the offset of "offset".
*@li *reserve_space_4: A Tensor of type float32, with shape NHWC, NCHW. Pass "None" to skip this output.
*@li *reserve_space_5: A Tensor of type float32, with shape NHWC, NCHW. Pass "None" to skip this output . \n

*@attention Constraints:
* The preceding layer of this operator must be operator BatchNorm . \n
@@ -423,11 +425,11 @@ REG_OP(BatchNormGrad)

*@par Inputs:
* Five inputs, including:
*@li y_backprop: A 3D or 6D Tensor of type float16 or float32, with format NDHWC, NCDHW, or NDC1HWC0, for the gradient.
*@li x: A 3D or 6D Tensor of type float16 or float32, with format NDHWC, NCDHW, or NDC1HWC0.
*@li scale: A 3D or 6D Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0.
*@li reserve_space_1: A 3D or 6D Tensor of type float32, with format NDHWC, NCDHW, or NC1HWC0. It is an output of BatchNorm.
*@li reserve_space_2: A 3D or 6D Tensor of type float32, with format NDHWC, NCDHW, or NC1HWC0. It is an output of BatchNorm . \n
*@li y_backprop: A 3D or 6D Tensor of type float16 or float32, with format NDHWC, NCDHW, for the gradient.
*@li x: A 3D or 6D Tensor of type float16 or float32, with format NDHWC, NCDHW.
*@li scale: A 3D or 6D Tensor of type float32, with format NDHWC, NCDHW.
*@li reserve_space_1: A 3D or 6D Tensor of type float32, with format NDHWC, NCDHW. It is an output of BatchNorm.
*@li reserve_space_2: A 3D or 6D Tensor of type float32, with format NDHWC, NCDHW. It is an output of BatchNorm . \n

*@par Attributes:
*@li epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x".
@@ -435,11 +437,11 @@ REG_OP(BatchNormGrad)
*@li is_training: An optional bool. Defaults to "true". Specifies the operation is for training (default) or inference . \n

*@par Outputs:
*@li x_backprop: A Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "x".
*@li scale_backprop: A Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0, for the offset of "scale".
*@li *offset_backprop: A Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0, for the offset of "offset".
*@li *reserve_space_4: A Tensor of type float32, with shape NDHWC, NCDHW, or NDC1HWC0. Pass "None" to skip this output.
*@li *reserve_space_5: A Tensor of type float32, with shape NDHWC, NCDHW, or NDC1HWC0. Pass "None" to skip this output . \n
*@li x_backprop: A Tensor of type float16 or float32, with format NHWC, NCHW, for the offset of "x".
*@li scale_backprop: A Tensor of type float32, with format NDHWC, NCDHW, for the offset of "scale".
*@li *offset_backprop: A Tensor of type float32, with format NDHWC, NCDHW, for the offset of "offset".
*@li *reserve_space_4: A Tensor of type float32, with shape NDHWC, NCDHW. Pass "None" to skip this output.
*@li *reserve_space_5: A Tensor of type float32, with shape NDHWC, NCDHW. Pass "None" to skip this output . \n

*@attention Constraints:
* The preceding layer of this operator must be operator BatchNorm . \n
@@ -515,7 +517,7 @@ REG_OP(BatchNormGradExt2)
*@brief Performs batch normalization . \n

*@par Inputs:
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW.
*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" Specifies the mean used for inference.
*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" Specifies the variance used for inference.
*@li momentum: A Tensor,represents the mean and the variance's scale factor
@@ -545,7 +547,7 @@ REG_OP(BNInference)
*@brief Performs batch normalization . \n

*@par Inputs:
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW.
*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" Specifies the mean used for inference.
*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" Specifies the variance used for inference.
*@li scale: An optional tensor of type float16 or float32, no use


+ 118
- 25
third_party/fwkacllib/inc/ops/nn_detect_ops.h View File

@@ -268,7 +268,7 @@ REG_OP(ROIAlign)

*@par Inputs:
* Two inputs, including:
*@li x: An NC1HWC0 or NCHW feature map of type is float32 or float16.
*@li x: An NCHW feature map of type is float32 or float16.
*@li img: source image. Has the same type and format as "x" . \n

*@par Attributes:
@@ -316,12 +316,12 @@ REG_OP(PriorBox)

*@par Inputs:
* Six inputs, including:
*@li x: An NC1HWC0 or NCHW feature map of type is float32 or float16.
*@li x: An NCHW feature map of type is float32 or float16.
*@li img: source image. Has the same type and format as "x".
*@li data_h: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the matrix for indexing the feature map height.
*@li data_w: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the matrix for indexing the feature map width.
*@li box_height: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the height of each prior box.
*@li box_width: An NC1HWC0 or NCHW tensor of type float32 or float16, specifying the width of each prior box . \n
*@li data_h: An NCHW tensor of type float32 or float16, specifying the matrix for indexing the feature map height.
*@li data_w: An NCHW tensor of type float32 or float16, specifying the matrix for indexing the feature map width.
*@li box_height: An NCHW tensor of type float32 or float16, specifying the height of each prior box.
*@li box_width: An NCHW tensor of type float32 or float16, specifying the width of each prior box . \n

*@par Attributes:
*@li min_size: A required float32, specifying the minimum edge length of a square prior box.
@@ -371,7 +371,7 @@ REG_OP(PriorBoxD)

*@par Inputs:
* Six inputs, including:
*@li x: An NC1HWC0 or NCHW feature map of type is float32 or float16.
*@li x: An NCHW feature map of type is float32 or float16.
*@li img: source image. Has the same type and format as "x".
*@li boxes: An ND tensor of type float32 or float16, specifying the prior box information. Same as output y

@@ -420,7 +420,7 @@ REG_OP(PriorBoxDV2)

*@par Inputs:
* Two inputs, including:
*@li x: An NC1HWC0 tensor of type float16 or float32, describing the feature
*@li x: A tensor of type float16 or float32, describing the feature
* map, dimension C1 must be equal to
* (int(output_dim+15)/C0))*group_size*group_size.
*@li rois: A tensor of type float16 or float32, with shape
@@ -438,7 +438,7 @@ REG_OP(PriorBoxDV2)
* coordinates to the ROI coordinates . \n

*@par Outputs:
*y: An NC1HWC0 tensor of type float16 or float32, describing the result
*y: A tensor of type float16 or float32, describing the result
* feature map . \n

*@attention Constraints:
@@ -1171,7 +1171,7 @@ REG_OP(SPP)

*@par Inputs:
* Three inputs, including:
*@li x: An NC1HWC0 tensor of type float16 or float32, describing the feature
*@li x: A tensor of type float16 or float32, describing the feature
* map. The data of x must be greater than or equal to "0.0".
*@li rois: A tensor of type float16 or float32, with 3D shape
* [batch, 5, roi_max_num], describing the RIOs. Each ROI consists of five
@@ -1195,7 +1195,7 @@ REG_OP(SPP)
* coordinates of width to the ROI coordinates . \n

*@par Outputs:
*y: An NC1HWC0 tensor of type float16 or float32, describing the result
*y: A tensor of type float16 or float32, describing the result
* feature map . \n

*@attention Constraints:
@@ -1844,6 +1844,7 @@ REG_OP(NonMaxSuppressionV7)
REG_OP(RoiExtractor)
.DYNAMIC_INPUT(features, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(index, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(finest_scale, Int, 56)
.ATTR(roi_scale_factor, Float, 0)
@@ -1860,7 +1861,7 @@ REG_OP(RoiExtractor)

*@par Inputs:
* Two inputs, including:
*@li x: An NC1HWC0 tensor of type float16 or float32, describing the feature
*@li x: A tensor of type float16 or float32, describing the feature
* map, dimension C1 must be equal to
* (int(output_dim+15)/C0))*group_size*group_size.
*@li rois: A tensor of type float16 or float32, with shape
@@ -1878,7 +1879,7 @@ REG_OP(RoiExtractor)
* coordinates to the ROI coordinates . \n

*@par Outputs:
*y: An NC1HWC0 tensor of type float16 or float32, describing the result
*y: A tensor of type float16 or float32, describing the result
* feature map . \n

*@attention Constraints:
@@ -1898,7 +1899,7 @@ REG_OP(PSROIPoolingV2)

*@par Inputs:
* Two inputs, including:
*@li x: An NC1HWC0 tensor of type float16 or float32, describing the result
*@li x: A tensor of type float16 or float32, describing the result
* feature map . \n
*@li rois: A tensor of type float16 or float32, with shape
* [batch, 5, rois_num], describing the ROIs, each ROI consists of five
@@ -1916,7 +1917,7 @@ REG_OP(PSROIPoolingV2)
*@li input_size: A required listInt, mapping the gradinput size: (H, W)

*@par Outputs:
*y: An NC1HWC0 tensor of type float16 or float32, describing the feature
*y: A tensor of type float16 or float32, describing the feature
* map, dimension C1 must be equal to
* (int(output_dim+15)/C0))*group_size*group_size.

@@ -2030,7 +2031,7 @@ REG_OP(GridAssignPositive)
.OP_END_FACTORY_REG(GridAssignPositive)

/**
*@brief GIoUGrad . \n
* @brief Calculate the inverse gradient of GIoU. \n

*@par Inputs:
*@li dy : data of grad increment, a 1D Tensor of type float16 or float32 with
@@ -2064,17 +2065,88 @@ REG_OP(GIoUGrad)
.OP_END_FACTORY_REG(GIoUGrad)

/**
*@brief RotatedOverlaps . \n
* @brief Calculate the inverse gradient of DIoU. \n

* @par Inputs:
* @li dy : data of grad increment, a 1D Tensor of type float16 or float32 with
* shape (N,).
* @li bboxes: Bounding boxes, a 2D Tensor of type float16 or float32 with
* shape (4, N). "N" indicates the number of bounding boxes, and the value
* "4" refers to [x1, y1, x2, y2] or [x, y, w, h].
* @li gtboxes: Ground-truth boxes, a 2D Tensor of type float16 or float32
* with shape (4, M). "M" indicates the number of ground truth boxes, and
* the value "4" refers to [x1, y1, x2, y2] or [x, y, w, h] . \n

* @par Attributes:
* @li trans: An optional attr, true for 'xywh', false for 'xyxy', only support true now.
* @li is_cross: An optional attr, if false M equals N, only support false now.
* @li mode: An optional attr, a character string with the value range of ['iou', 'iof'],
* only support 'iou' now. \n

* @par Outputs:
* @li dbboxes: A 2D Tensor of type float16 or float32 with shape [4, N].
* @li dgtboxes: A 2D Tensor of type float16 or float32 with shape [4, M].
*/
REG_OP(DIoUGrad)
.INPUT(dy, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(bboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(gtboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(dbboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(dgtboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(trans, Bool, false)
.ATTR(is_cross, Bool, true)
.ATTR(mode, String, "iou")
.OP_END_FACTORY_REG(DIoUGrad)

/**
* @brief Calculate the inverse gradient of CIoU. \n

* @par Inputs:
* @li dy : data of grad increment, a 1D Tensor of type float16 or float32 with
* shape (N,).
* @li bboxes: Bounding boxes, a 2D Tensor of type float16 or float32 with
* shape (4, N). "N" indicates the number of bounding boxes, and the value
* "4" refers to [x1, y1, x2, y2] or [x, y, w, h].
* @li gtboxes: Ground-truth boxes, a 2D Tensor of type float16 or float32
* with shape (4, M). "M" indicates the number of ground truth boxes, and
* the value "4" refers to [x1, y1, x2, y2] or [x, y, w, h] .
* @li atan_sub: Intermediate result of forward calculation,
* a 1D Tensor of type float16 or float32 with shape (N,). \n

* @par Attributes:
* @li trans: An optional attr, true for 'xywh', false for 'xyxy', only support true now.
* @li is_cross: An optional attr, if false M equals N, only support false now.
* @li mode: An optional attr, a character string with the value range of ['iou', 'iof'],
* only support 'iou' now. \n

* @par Outputs:
* @li dbboxes: A 2D Tensor of type float16 or float32 with shape [4, N].
* @li dgtboxes: A 2D Tensor of type float16 or float32 with shape [4, M].
*/
REG_OP(CIoUGrad)
.INPUT(dy, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(bboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(gtboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(atan_sub, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(dbboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(dgtboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(trans, Bool, false)
.ATTR(is_cross, Bool, true)
.ATTR(mode, String, "iou")
.OP_END_FACTORY_REG(CIoUGrad)

/**
* @brief RotatedOverlaps . \n

*@par Inputs:
*@li boxes : data of grad increment, a 3D Tensor of type float32 with
* shape (B, 5, N). "N" indicates the number of boxes, and the value
* "5" refers to [x1, y1, x2, y2, theta] or [x, y, w, h, theta].
*@li query_boxes: Bounding boxes, a 3D Tensor of type float32 with
* @li query_boxes: Bounding boxes, a 3D Tensor of type float32 with
* shape (B, 5, K). "K" indicates the number of boxes, and the value
* "5" refers to [x1, y1, x2, y2, theta] or [x, y, w, h, theta].

*@par Attributes:
* @par Attributes:
* trans: An optional attr, true for 'xyxyt', false for 'xywht'.

*@par Outputs:
@@ -2093,21 +2165,21 @@ REG_OP(RotatedOverlaps)
/**
*@brief RotatedIou . \n

*@par Inputs:
* @par Inputs:
*@li boxes : data of grad increment, a 3D Tensor of type float32 with
* shape (B, 5, N). "N" indicates the number of boxes, and the value
* "5" refers to [x1, y1, x2, y2, theta] or [x, y, w, h, theta].
*@li query_boxes: Bounding boxes, a 3D Tensor of type float32 with
* @li query_boxes: Bounding boxes, a 3D Tensor of type float32 with
* shape (B, 5, K). "K" indicates the number of boxes, and the value
* "5" refers to [x1, y1, x2, y2, theta] or [x, y, w, h, theta].

*@par Attributes:
* @par Attributes:
*@li trans: An optional attr, true for 'xyxyt', false for 'xywht'.
*@li mode: An optional attr, a character string with the value range of ['iou', 'iof'],
* @li mode: An optional attr, a character string with the value range of ['iou', 'iof'],
* only support 'iou' now.
*@li is_cross: Cross calculation when it is True, and one-to-one calculation when it is False.
*@li v_threshold: An optional attr, provide condition relaxation for intersection calculation.
*@li e_threshold: An optional attr, provide condition relaxation for intersection calculation.
* @li e_threshold: An optional attr, provide condition relaxation for intersection calculation.

*@par Outputs:
* iou: A 3D Tensor of float32 with shape [B, N, K].
@@ -2140,7 +2212,7 @@ REG_OP(RotatedIou)
* "N" indicates the number of bounding boxes, and the value "5" refers to
* "x0", "x1", "y0", "y1" and "angle". \n

*@par Attributes:
* @par Attributes:
*@li weight: A float list for "x0", "x1", "y0", "y1" and "angle",
* defaults to [1.0, 1.0, 1.0, 1.0, 1.0].

@@ -2183,6 +2255,27 @@ REG_OP(RotatedBoxDecode)
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(weight, ListFloat, {1.0, 1.0, 1.0, 1.0, 1.0})
.OP_END_FACTORY_REG(RotatedBoxDecode)

/**
* @brief sort rois to balance on each core. \n

* @par Inputs:
* One input, including:
* @li rois: ROI position. A 2D Tensor of float32 or float16 with shape (N, 5). "N" indicates the number of ROIs,
* the value "5" indicates the indexes of images where the ROIs are located, "batch", "x0", "y0", "x1", and "y1".

* @par Outputs:
* @li balance_rois: A 2D Tensor of float32 or float16 with shape (N, 5), Outputs of the rois which balance.
* @li index: 1D Tensor of int32 with shape (N,), that is the index of origin rois.

* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(BalanceRois)
.INPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(balance_rois, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(index, TensorType({DT_INT32}))
.OP_END_FACTORY_REG(BalanceRois)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_DETECT_OPS_H_


+ 78
- 7
third_party/fwkacllib/inc/ops/nn_norm_ops.h View File

@@ -104,9 +104,8 @@ REG_OP(SoftmaxCrossEntropyWithLogits)
*@par Inputs:
* Two inputs, including:
* @li softmax: Output of the softmax operator. Must be one of the following
* types: float16, float32, int32, int8, uint8. The format is NC1HWC0 or DN.
* @li grad_softmax: A Tensor. Has the same shape and type as "softmax".
* The format is NC1HWC0 or DN . \n
* types: float16, float32, int32, int8, uint8.
* @li grad_softmax: A Tensor. Has the same shape and type as "softmax".\n

*@par Attributes:
* axes: An optional list of ints. Defaults to "{-1}" . \n
@@ -1101,8 +1100,8 @@ REG_OP(GroupNorm)
*@brief Performs instance normalization . \n

*@par Inputs:
* Five inputs, including: (NC1HWC0, supported)
*@li x: A 5D Tensor of type float16 or float32, NC1HWC0.
* Five inputs, including:
*@li x: A 5D Tensor of type float16 or float32.
*@li gamma: A Tensor of type float32.
A 5D Tensor for scaling factor, to scale the normalized x.
*@li beta: A Tensor of type float32.
@@ -1121,7 +1120,7 @@ the value used for the running_mean and running_var computation. Default: "0.1".
variance to avoid dividing by zero. Defaults to "0.00001" . \n

*@par Outputs:
* Three outputs, including: (NHWC, NCHW NC1HWC0 supported)
* Three outputs, including: (NHWC, NCHW supported)
*@li y: A 5D tensor of type float16 or float32 for the normalized "x",
*@li batch_mean: A Tensor of type float32.
Specifies the mean of "x".
@@ -1154,7 +1153,7 @@ REG_OP(InstanceNormV2)
*@brief Performs instance normalization for inference.

*@par Inputs:\n
* Five inputs, including: (NC1HWC0 supported)
* Five inputs, including:
*@li x: A Tensor of type float16 or float32.
*@li gamma: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling gamma.
*@li beta: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling beta.
@@ -1740,5 +1739,77 @@ REG_OP(DropoutWithMulsAndSoftmaxGrad)
.REQUIRED_ATTR(alpha, Float)
.ATTR(axes, ListInt, { -1 })
.OP_END_FACTORY_REG(DropoutWithMulsAndSoftmaxGrad)

/**
* @brief Loss function that measures the softmax cross entropy. \n

* @par Inputs:
* Three inputs, including:
* @li scores: A Tensor. Must be one of the following types: half, float32, double.
* A "batch_size * num_classes" matrix.
* @li labels: A Tensor. Must be one of the following types: "int32", "int64".
* @li weights: A manual rescaling weight given to each class.
* If given, it has to be a 1D Tensor assigning weight to each of the classes.
* Otherwise, it is treated as if having all ones. \n

* @par Attributes:
* ignore_index:Specifies a target value that is ignored and does not contribute to the input gradient.
* It's an optional value.
* reduction: A character string from "none", "mean", and "sum", specifying the gradient output mode. Defaults to "mean" . \n

* @par Outputs:
* @li loss: A Tensor for per example loss (a "batch_size" vector). Has the same type as "scores".
* @li log_prop: A Tensor. Has the same type as "scores" . \n

* @par Third-party framework compatibility
* Compatible with the ONNX operator SoftmaxCrossEntropyLoss.
*/
REG_OP(SoftmaxCrossEntropyLoss)
.INPUT(scores, TensorType({DT_DOUBLE,DT_FLOAT16,DT_FLOAT,DT_BFLOAT16}))
.INPUT(labels, TensorType({DT_INT32, DT_INT64}))
.OPTIONAL_INPUT(weights, TensorType({DT_DOUBLE,DT_FLOAT16,DT_FLOAT,DT_BFLOAT16}))
.ATTR(ignore_index, Int, 0)
.ATTR(reduction, String, "mean")
.OUTPUT(loss, TensorType({DT_DOUBLE,DT_FLOAT16,DT_FLOAT,DT_BFLOAT16}))
.OUTPUT(log_prop, TensorType({DT_DOUBLE,DT_FLOAT16,DT_FLOAT,DT_BFLOAT16}))
.OP_END_FACTORY_REG(SoftmaxCrossEntropyLoss)

/**
* @brief Function axpy with softmax and dropoutdomask . \n

* @par Inputs:
* Three inputs, including:
* @li x1: A mutable Tensor. The type only support float16.
* @li x2: A mutable Tensor. The type only support float16.
* @li mask: A mutable Tensor. Must meet all of the following rules:
* shape of mask should be 1D.
* dtype of mask should be uint8.
* value of shape should meet the following algorithm:
* value = (size(x) + 128 - 1) // 128 * 128 . \n

* @par Attributes:
* @li alpha: An attribute used to scale the tensor. The type is float . \n
* @li input_keep_prob: An attribute used to judge which units should be kept.
* The type is float . \n
* @li axis: A list of int. The dimension softmax would be performed on. Defaults
* to "[-1]" . \n

* @par Outputs:
* y1: A mutable Tensor. Has the same type as "x1". \n
* y2: A mutable Tensor. Has the same type as "x1". \n

* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(AxpyWithSoftmaxAndDropOutDoMask)
.INPUT(x1, TensorType({DT_FLOAT16}))
.INPUT(x2, TensorType({DT_FLOAT16}))
.INPUT(mask, TensorType({DT_UINT8}))
.OUTPUT(y1, TensorType({DT_FLOAT16}))
.OUTPUT(y2, TensorType({DT_FLOAT16}))
.REQUIRED_ATTR(alpha, Float)
.REQUIRED_ATTR(input_keep_prob, Float)
.ATTR(axis, ListInt, {-1})
.OP_END_FACTORY_REG(AxpyWithSoftmaxAndDropOutDoMask)
} // namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_NORM_OPS_H_

+ 191
- 29
third_party/fwkacllib/inc/ops/nn_ops.h View File

@@ -54,17 +54,17 @@ REG_OP(InTopKV2)
*@brief Performs batch normalization . \n

*@par Inputs:
* Five inputs, including: (NHWC, NCHW, or NC1HWC0 supported)
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the scaling factor.
*@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the offset.
*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the mean used for inference. Must be "None" if the
* Five inputs, including: (NHWC, NCHW supported)
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D.
*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Specifies the scaling factor.
*@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Specifies the offset.
*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Specifies the mean used for inference. Must be "None" if the
operation is used for training.
*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be
5D if input "x" is with format NC1HWC0. Specifies the variance used for inference. Must be "None"
*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Specifies the variance used for inference. Must be "None"
if the operation is used for training . \n

*@par Attributes:
@@ -73,16 +73,16 @@ if the operation is used for training . \n
*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True" . \n

*@par Outputs:
* Five outputs, including: (NHWC, NCHW, or NC1HWC0 supported)
*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the mean of "x".
* Five outputs, including: (NHWC, NCHW supported)
*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", with format NHWC or NCHW for 4D.
*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Specifies the mean of "x".
*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x".
Specifies the variance of "x".
*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output . \n
Specifies the variance of "x" for gradient computation. Pass "None" to skip this output . \n

*@attention Constraints:
*@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available,
@@ -109,18 +109,20 @@ REG_OP(FusedBatchNormV2)
* @brief Large amount of data sort.First operator of TopK.
* @par Inputs:
* two input, including:
* @li input_data: A Tensor. Data to be sorted. Support float16
* @li input_index: A Tensor. Range(0, 2048). Datatype and format is same as input_data.
* @li input_data: A Tensor. Data to be sorted. Support float16 or float32.
* @li input_index: A Tensor. Range(0, 2048). Support float16 or int32.
* @par Attributes:
* k_num: Int.Number to be sorted.
* @par Outputs:
* One output, including:
* output_proposal: A Tensor. Datatype and format is same as input_data. Proposal sorted for each channel.
* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(SegmentSort)
.INPUT(input_data, TensorType({DT_FLOAT16}))
.INPUT(input_index, TensorType({DT_FLOAT16}))
.OUTPUT(output_proposal, TensorType({DT_FLOAT16}))
.INPUT(input_data, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(input_index, TensorType({DT_FLOAT16,DT_INT32}))
.OUTPUT(output_proposal, TensorType({DT_FLOAT16,DT_FLOAT}))
.REQUIRED_ATTR(k_num, Int)
.OP_END_FACTORY_REG(SegmentSort)

@@ -128,17 +130,23 @@ REG_OP(SegmentSort)
* @brief: Large amount of data sort.Second operator of TopK.
* @par Inputs:
* One input, including:
* input_proposal: A Tensor. Proposal sorted for each channel. Support float16
* input_proposal: A Tensor. Proposal sorted for each channel. Support float16 or float32
* @par Attributes:
* k_num: Int.Number to be sorted.
* include_index: Bool. If include_index is false, output proposal; if include_index is true, output data and index.
* @par Outputs:
* One output, including:
* Two outputs, including:
* output_proposal: A Tensor. Datatype and format is same as input_data. Proposal sorted for each channel.
* output_index: A Tensor.If include_index is true, output index.
* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(MultiMerge)
.INPUT(input_proposal, TensorType({DT_FLOAT16}))
.OUTPUT(output_proposal, TensorType({DT_FLOAT16}))
.INPUT(input_proposal, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(output_proposal, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(output_index, TensorType({DT_INT32}))
.REQUIRED_ATTR(k_num, Int)
.ATTR(include_index, Bool, false)
.OP_END_FACTORY_REG(MultiMerge)

/**
@@ -152,12 +160,166 @@ REG_OP(MultiMerge)
* Two output, including:
* @li output_data: A Tensor. Datatype and format is same as input_data. Data sorted.
* @li output_index: A Tensor. int32. Data index.
* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(SingleMerge)
.INPUT(input_proposal, TensorType({DT_FLOAT16}))
.OUTPUT(output_data, TensorType({DT_FLOAT16}))
.OUTPUT(output_index, TensorType({DT_INT32}))
.INPUT(input_proposal, TensorType({ DT_FLOAT16 }))
.OUTPUT(output_data, TensorType({ DT_FLOAT16 }))
.OUTPUT(output_index, TensorType({ DT_INT32 }))
.REQUIRED_ATTR(k_num, Int)
.OP_END_FACTORY_REG(SingleMerge)

/**
* @brief MultiHeadAttention.
* @par Inputs:
* Thirteen inputs, including:
* @li query: A Tensor. Query of Attention. Support float16
* @li key: A Tensor. Key of Attention. Support float16
* @li value: A Tensor. Value of Attention. Support float16
* @li query_weight: A Tensor. QueryWeight of Attention. Support float16
* @li key_weight: A Tensor. KeyWeight of Attention. Support float16
* @li value_weight: A Tensor. ValueWeight of Attention. Support float16
* @li attn_mask: A Tensor. AttentionMask of Attention. Support float16
* @li out_proj_weight: A Tensor. OutProjWeight of Attention. Support float16
* @li query_bias: Optional Tensor. QueryBias of Attention. Support float16
* @li key_bias: Optional Tensor. KeyBias of Attention. Support float16
* @li value_bias: Optional Tensor. ValueBias of Attention. Support float16
* @li out_proj_bias: Optional Tensor. OutProjBias of Attention. Support float16
* @li dropout_mask_input: Optional Tensor. DropOutMask of Attention. Support uint8 \n

* @par Attributes:
* @li attn_head_num: Attention Head numbers, Support int
* @li attn_dim_per_head: Attention dim of a Head, Support int
* @li src_len: source length, Support int
* @li tgt_len: target length, Support int
* @li keep_prob: dropout keep probability, Support float
* @li softmax_use_float: SoftMax Use Float32 to keep precision, Support bool \n

* @par Outputs:
* Eight outputs, including:
* @li y: A Tensor. Result of Attention. Support float16
* @li dropout_mask: DropOutMask of Attention. Support uint8
* @li query_res: Query Result of Attention. Support float16
* @li key_res: Key Result of Attention. Support float16
* @li value_res: Value Result of Attention. Support float16
* @li attn_scores: Attention Scores of SoftMax. Support float16, float
* @li attn_res: Attention Result of SoftMax. Support float16
* @li context: Context of Attention. Support float16

* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(MultiHeadAttention)
.INPUT(query, TensorType({DT_FLOAT16}))
.INPUT(key, TensorType({DT_FLOAT16}))
.INPUT(value, TensorType({DT_FLOAT16}))
.INPUT(query_weight, TensorType({DT_FLOAT16}))
.INPUT(key_weight, TensorType({DT_FLOAT16}))
.INPUT(value_weight, TensorType({DT_FLOAT16}))
.INPUT(attn_mask, TensorType({DT_FLOAT16}))
.INPUT(out_proj_weight, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(query_bias, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(key_bias, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(value_bias, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(out_proj_bias, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(dropout_mask_input, TensorType({DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT16}))
.OUTPUT(dropout_mask, TensorType({DT_UINT8}))
.OUTPUT(query_res, TensorType({DT_FLOAT16}))
.OUTPUT(key_res, TensorType({DT_FLOAT16}))
.OUTPUT(value_res, TensorType({DT_FLOAT16}))
.OUTPUT(attn_scores, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(attn_res, TensorType({DT_FLOAT16}))
.OUTPUT(context, TensorType({DT_FLOAT16}))
.REQUIRED_ATTR(attn_head_num, Int)
.REQUIRED_ATTR(attn_dim_per_head, Int)
.REQUIRED_ATTR(src_len, Int)
.REQUIRED_ATTR(tgt_len, Int)
.REQUIRED_ATTR(keep_prob, Float)
.REQUIRED_ATTR(softmax_use_float, Bool)
.OP_END_FACTORY_REG(MultiHeadAttention)

/**
* @brief MultiHeadAttentionGrad.
* @par Inputs:
* Fifteen inputs, including:
* @li query: A Tensor. Query of Attention. Support float16
* @li key: A Tensor. Key of Attention. Support float16
* @li value: A Tensor. Value of Attention. Support float16
* @li query_weight: A Tensor. QueryWeight of Attention. Support float16
* @li key_weight: A Tensor. KeyWeight of Attention. Support float16
* @li value_weight: A Tensor. ValueWeight of Attention. Support float16
* @li out_proj_weight: A Tensor. OutProjWeight of Attention. Support float16
* @li query_res: A Tensor. Query Result of Attention. Support float16
* @li key_res: A Tensor. Key Result of Attention. Support float16
* @li value_res: A Tensor. Value Result of Attention. Support float16
* @li attn_scores: A Tensor. Attention Scores of Attention. Support float16, float
* @li attn_res: A Tensor. Attention Result of Attention. Support float16
* @li context: A Tensor. Context of Attention. Support float16
* @li y_grad: A Tensor. Grad of Attention. Support float16
* @li dropout_mask: : A Tensor. Query Result of Attention. Support uint8 \n

* @par Attributes:
* @li attn_head_num: Attention Head numbers, Support int
* @li attn_dim_per_head: Attention dim of a Head, Support int
* @li src_len: source length, Support int
* @li tgt_len: target length, Support int
* @li keep_prob: dropout keep probability, Support float
* @li softmax_use_float: SoftMax Use Float32 to keep precision, Support bool
* @li bias_grad_mask: mask for attention has bias grad, Support list bool \n

* @par Outputs:
* Eleven outputs, including:
* @li query_weight_grad: QueryWeight Grad of Attention. Support float16
* @li key_weight_grad: KeyWeight Grad of Attention. Support float16
* @li value_weight_grad: ValueWeight Grad of Attention. Support float16
* @li out_proj_weight_grad: OutProjWeight Grad of Attention. Support float16
* @li query_grad: Query Grad of Attention. Support float16
* @li key_grad: Key Grad of Attention. Support float16
* @li value_grad: Value Grad of Attention. Support float16
* @li query_bias_grad: QueryBias Grad of Attention. Support float16
* @li key_bias_grad: KeyBias Grad of Attention. Support float16
* @li value_bias_grad: ValueBias Grad of Attention. Support float16
* @li out_proj_bias_grad: OutProjBias Grad of Attention. Support float16

* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(MultiHeadAttentionGrad)
.INPUT(query, TensorType({DT_FLOAT16}))
.INPUT(key, TensorType({DT_FLOAT16}))
.INPUT(value, TensorType({DT_FLOAT16}))
.INPUT(query_weight, TensorType({DT_FLOAT16}))
.INPUT(key_weight, TensorType({DT_FLOAT16}))
.INPUT(value_weight, TensorType({DT_FLOAT16}))
.INPUT(out_proj_weight, TensorType({DT_FLOAT16}))
.INPUT(query_res, TensorType({DT_FLOAT16}))
.INPUT(key_res, TensorType({DT_FLOAT16}))
.INPUT(value_res, TensorType({DT_FLOAT16}))
.INPUT(attn_scores, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(attn_res, TensorType({DT_FLOAT16}))
.INPUT(context, TensorType({DT_FLOAT16}))
.INPUT(y_grad, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(dropout_mask, TensorType({DT_UINT8}))
.OUTPUT(query_weight_grad, TensorType({DT_FLOAT16}))
.OUTPUT(key_weight_grad, TensorType({DT_UINT8}))
.OUTPUT(value_weight_grad, TensorType({DT_FLOAT16}))
.OUTPUT(out_proj_weight_grad, TensorType({DT_FLOAT16}))
.OUTPUT(query_grad, TensorType({DT_FLOAT16}))
.OUTPUT(key_grad, TensorType({DT_FLOAT16}))
.OUTPUT(value_grad, TensorType({DT_FLOAT16}))
.OUTPUT(query_bias_grad, TensorType({DT_FLOAT16}))
.OUTPUT(key_bias_grad, TensorType({DT_FLOAT16}))
.OUTPUT(value_bias_grad, TensorType({DT_FLOAT16}))
.OUTPUT(out_proj_bias_grad, TensorType({DT_FLOAT16}))
.REQUIRED_ATTR(attn_head_num, Int)
.REQUIRED_ATTR(attn_dim_per_head, Int)
.REQUIRED_ATTR(src_len, Int)
.REQUIRED_ATTR(tgt_len, Int)
.REQUIRED_ATTR(keep_prob, Float)
.REQUIRED_ATTR(softmax_use_float, Bool)
.REQUIRED_ATTR(bias_grad_mask, ListBool)
.OP_END_FACTORY_REG(MultiHeadAttentionGrad)
}// namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_OPS_H_

+ 72
- 53
third_party/fwkacllib/inc/ops/nn_pooling_ops.h View File

@@ -81,10 +81,16 @@ REG_OP(Pooling)
*x: A tensor of type float16, float32, double . \n

*@par Attributes:
*@li ksize: A required list of 4 ints, specifying the size (N, C, H, and W) of the sliding window, where N = C = 1, and H and W are positive integers within the range [1, 255].
*@li strides: A required list of 4 ints, specifying the stride of the sliding window. The strides of the N and C dimensions are 1. The strides of the H and W dimensions are positive integers within the range [1, 63].
*@li padding: A required string, specifying the padding algorithm, either "VALID" or "SAME". With "SAME" means that the outputs will have the same spatial dimensions as its inputs. With "VALID" means no padding.
*@li data_format: An optional string, specifying the data format of "ksize" and "strides", either "NCHW", "NC1HWC0", or "NHWC" (default) . \n
*@li ksize: A required list of 4 ints, specifying the size (N, C, H, and W) of the sliding window,
* where N = C = 1, and H and W are positive integers within the range [1, 255].
*@li strides: A required list of 4 ints, specifying the stride of the sliding window.
* The strides of the N and C dimensions are 1.
* The strides of the H and W dimensions are positive integers within the range [1, 63].
*@li padding: A required string, specifying the padding algorithm,
* either "VALID" or "SAME". With "SAME" means that the outputs will have the same spatial dimensions as its inputs.
* With "VALID" means no padding.
*@li data_format: An optional string, specifying the data format of "ksize" and "strides",
* either "NCHW", or "NHWC" (default) . \n

*@par Outputs:
*y: The average pooled output tensor. Has the same type and format as input "x" . \n
@@ -94,7 +100,8 @@ REG_OP(Pooling)
*@li Only single input and single output are supported.
*@li Global pooling is supported.
*@li "ksize_H" and "ksize_W" are positive integers within the range [1, 255]. ksize_H * ksize_W < 256
*@li Due to instruction restrictions, the values of "strides_h" and "strides_w" are positive integers within the range [1, 63].
*@li Due to instruction restrictions,
* the values of "strides_h" and "strides_w" are positive integers within the range [1, 63].
*@par Third-party framework compatibility
* Compatible with the TensorFlow operator AvgPool.
*/
@@ -114,11 +121,18 @@ REG_OP(AvgPool)
*x: A tensor of type float16, float32, double.

*@par Attributes:
*@li ksize: A required list of 4 ints, specifying the size (N, C, H, and W) of the sliding window, where N = C = 1, and H and W are positive integers within the range [1, 255].
*@li strides: A required list of 4 ints, specifying the stride of the sliding window. The strides of the N and C dimensions are 1. The strides of the H and W dimensions are positive integers within the range [1, 63].
*@li padding_mode: A required string, specifying the padding algorithm, either "VALID", "SAME" and "CALCULATED". With "SAME" means that the outputs will have the same spatial dimensions as its inputs. With "VALID" means no padding.
*@li ksize: A required list of 4 ints, specifying the size (N, C, H, and W) of the sliding window,
* where N = C = 1, and H and W are positive integers within the range [1, 255].
*@li strides: A required list of 4 ints, specifying the stride of the sliding window.
* The strides of the N and C dimensions are 1.
* The strides of the H and W dimensions are positive integers within the range [1, 63].
*@li padding_mode: A required string, specifying the padding algorithm,
* either "VALID", "SAME" and "CALCULATED".
* With "SAME" means that the outputs will have the same spatial dimensions as its inputs.
* With "VALID" means no padding.
*@li pads: Pad value when padding_mode is "CALCULATED".
*@li data_format: An optional string, specifying the data format of "ksize" and "strides", either "NCHW", "NC1HWC0", or "NHWC" (default).
*@li data_format: An optional string, specifying the data format of "ksize" and "strides",
* either "NCHW", or "NHWC" (default).
*@li global_pooling: Global or not. If true, pads will change to {0,0,0,0} and ksize will change to [input_h, input_w]
*@li ceil_mode: Use ceil or floor to calculate the output size when padding_mode is "CALCULATED".
*@li exclusive: Ignore padding area or not when calculating average.
@@ -130,7 +144,8 @@ REG_OP(AvgPool)
*@li Only single input and single output are supported.
*@li Global pooling is supported.
*@li "ksize_H" and "ksize_W" are positive integers within the range [1, 255]. ksize_H * ksize_W < 256
*@li Due to instruction restrictions, the values of "strides_h" and "strides_w" are positive integers within the range [1, 63].
*@li Due to instruction restrictions,
* the values of "strides_h" and "strides_w" are positive integers within the range [1, 63].
*@par Third-party framework compatibility
* Compatible with the TensorFlow operator AvgPoolV2.
*/
@@ -310,21 +325,24 @@ REG_OP(AvgPool3DGradD)

*@par Inputs:
* One input:
*x: An NC1HWC0 Tensor of type float16.
*x: A Tensor of type float16.


*@par Attributes:
*@li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for each dimension of the input tensor. No default value.
*@li strides: A required list of int8, int16, int32, or int64 values, specifying the stride of the sliding window for each dimension of the input tensor. No default value.
*@li ksize: A required list of int8, int16, int32, or int64 values,
* specifying the size of the window for each dimension of the input tensor. No default value.
*@li strides: A required list of int8, int16, int32, or int64 values,
* specifying the stride of the sliding window for each dimension of the input tensor. No default value.
*@li padding: A required string. No default value.
*@li data_format: An optional string. Defaults to "NC1HWC0" . \n
*@li data_format: An optional string . \n

*@par Outputs:
*y: A Tensor. Has the same type and format as input "x" . \n

*@attention Constraints:
*@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
*@li "stride is a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1.
*@li "stride is a list that has length 4: strides[0] = 1 or strides[3] = 1,
* strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1.
*@li "padding" is either "SAME" or "VALID" . \n

*@par Third-party framework compatibility
@@ -348,7 +366,7 @@ REG_OP(MaxPoolExt2)

*@par Inputs:
* One input:
*x: An NC1HWC0 Tensor. Supported type:float16, float32, double, int8, int16,
*x: A Tensor. Supported type:float16, float32, double, int8, int16,
* int32, int64, uint8, uint16, qint8

*@par Attributes:
@@ -391,7 +409,7 @@ REG_OP(MaxPool)
*@brief Performs max 3d pooling on the input . \n

*@par Inputs:
*x: An NC1HWC0 Tensor. Supported type float16, float32, double . \n
*x: A Tensor. Supported type float16, float32, double . \n

*@par Attributes:
*@li ksize: A required list of int8, int16, int32, or int64 values,
@@ -457,7 +475,6 @@ REG_OP(MaxPool3D)
* y: An 6D tensor. the maxpool3d output(max value), format as NDoC1HoWoC0.
* @par Outputs:
* argmax: A 5D uint16 tensor. the indice output.
* format as NC1HWC0, actually it represent N, Do, C1*ksize, Ho*Wo//16, 16.
*/
REG_OP(MaxPool3DWithArgmax)
.INPUT(x, TensorType::RealNumberType())
@@ -546,9 +563,9 @@ REG_OP(MaxPool3DGradGrad)
* @brief Computes gradients of the maxpooling function . \n

* @par Inputs:
* @li x1: A mutable NC1HWC0 tensor of type RealNumberType.
* @li x2: A mutable NC1HWC0 tensor of type RealNumberTypex.
* @li grad: A mutable NC1HWC0 tensor of type RealNumberType . \n
* @li x1: A mutable tensor of type RealNumberType.
 * @li x2: A mutable tensor of type RealNumberType.
* @li grad: A mutable tensor of type RealNumberType . \n

* @par Attributes:
* @li ksize: A required tuple or list, specifying the size of the window for
@@ -630,21 +647,24 @@ REG_OP(MaxPoolGradGrad)

*@par Inputs:
* Three inputs:
*@li x: An NC1HWC0 Tensor of type float16.
*@li strides: A required type of int32 values, specifying the stride of the sliding window for each dimension of the input tensor. No default value.
*@li ksize: A required type of int32 values, specifying the size of the window for each dimension of the input tensor. No default value.
*@li x: A Tensor of type float16.
*@li strides: A required type of int32 values,
* specifying the stride of the sliding window for each dimension of the input tensor. No default value.
*@li ksize: A required type of int32 values,
* specifying the size of the window for each dimension of the input tensor. No default value.


*@par Attributes:
*@li padding: A required string. No default value.
*@li data_format: An optional string. Defaults to "NC1HWC0" . \n
*@li data_format: An optional string. \n

*@par Outputs:
*y: A Tensor. Has the same type and format as input "x" . \n

*@attention Constraints:
*@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
*@li "stride is a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1.
*@li "stride is a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1,
* strides[2] <= 63, strides[2] >= 1.
*@li "padding" is either "SAME" or "VALID" . \n

*@par Third-party framework compatibility
@@ -713,7 +733,7 @@ REG_OP(MaxPoolWithArgmax)
*@li grad: An 4d tensor. Supported type: float, double, int32,
* uint8, int16, int8, int64, uint16, half, uint32, uint64.
* Must set the format, supported format list ["NCHW, NHWC"]
*@li argmx: An NC1HWC0 tensor of type int32 or int64 . \n
*@li argmx: A tensor of type int32 or int64 . \n

*@par Attributes:
*@li ksize: A required list of int8, int16, int32, or int64 values,
@@ -753,8 +773,8 @@ REG_OP(MaxPoolGradWithArgmax)

*@par Inputs:
* Two inputs:
*@li x: An NC1HWC0 Tensor of type float16.
*@li mask: An NC1HWC0 Tensor of type uint16 . \n
*@li x: A Tensor of type float16.
*@li mask: A Tensor of type uint16 . \n

*@par Attributes:
*@li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for each dimension of the input tensor. No default value.
@@ -763,7 +783,7 @@ REG_OP(MaxPoolGradWithArgmax)
*@li originshape:A required list of int8, int16, int32, or int64 values, No default value. \n

*@par Outputs:
*argmax: An NC1HWC0 Tensor of type int32 . \n
*argmax: A Tensor of type int32 . \n

*@attention Constraints:
*@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
@@ -1314,7 +1334,7 @@ REG_OP(AvgPool1DD)

*@par Outputs:
*y: A Tensor. Has the same type and format as input "x".
*argmax: A Tensor. type:uint16, format:NC1HWC0.
*argmax: A Tensor. type:uint16.
*@attention Constraints:
*@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
*@li "strides is a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1,
@@ -1388,7 +1408,7 @@ REG_OP(MaxPoolGradWithArgmaxV2)

* @par Inputs:
* One input:
* x: An NC1HWC0 Tensor. Supported type:float16, float32, double, int32, int64,
* x: A Tensor. Supported type:float16, float32, double, int32, int64,
* uint8, int16, int8, uint16, qint8

* @par Attributes:
@@ -1400,9 +1420,8 @@ REG_OP(MaxPoolGradWithArgmaxV2)
* the input tensor. No default value.
* @li padding_mode: A required string. Defaults to "CALCULATED".
* @li pads:A required list of int8, int16, int32, or int64 values,
* a data to caculate when padding_mode is "CALCULATED".
* a data to calculate when padding_mode is "CALCULATED".
* @li data_format: An optional string. Defaults to "NHWC" .
* If data_format = "NC1HWC0", ori_format must be "NCHW".
* @li global_pooling bool, Whether to use the global pooling.
* If global_pooling = true, kernel size and paddings will be ignored.
* Default False
@@ -1418,7 +1437,7 @@ REG_OP(MaxPoolGradWithArgmaxV2)
* ksize[1] * ksize[2] <= 255.
* @li "stride is a list that has length 4: strides[0] = 1 or strides[3] = 1,
* strides[1] <= 63, strides[0] >= 1, strides[2] <= 63, strides[2] >= 1.
* @li "padding" is "SAME" "VALID" or "CACULATE" .
 * @li "padding" is "SAME", "VALID" or "CALCULATED" .


* @par Third-party framework compatibility
@@ -1440,9 +1459,9 @@ REG_OP(MaxPoolV3)
* @brief Computes gradients of the maxpooling function . \n

* @par Inputs:
* @li orig_input: A mutable NC1HWC0 tensor of type RealNumberType.
* @li orig_output: A mutable NC1HWC0 tensor of type RealNumberTypex.
* @li grad: A mutable NC1HWC0 tensor of type RealNumberType . \n
* @li orig_input: A mutable tensor of type RealNumberType.
 * @li orig_output: A mutable tensor of type RealNumberType.
* @li grad: A mutable tensor of type RealNumberType . \n

* @par Attributes:
* @li ksize: A required list of int8, int16, int32, or int64 values,
@@ -1650,9 +1669,9 @@ REG_OP(AdaptiveAvgPool2dGrad)

* @par Inputs:
* Three inputs, including:
* @li x: An NC1HWC0 tensor of type float16.
* @li grad: An NC1HWC0 tensor of type float16.
* @li argmax: An NC1HWC0 tensor of type uint16 or int64. \n
* @li x: A tensor of type float16.
* @li grad: A tensor of type float16.
* @li argmax: A tensor of type uint16 or int64. \n

* @par Attributes:
* @li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for
@@ -1665,11 +1684,11 @@ REG_OP(AdaptiveAvgPool2dGrad)
* y: A Tensor. Has the same type and format as input "x". \n

* @attention Constraints:
* @li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
* @li "strides" is a list that has length 4: strides[0] = 1 or strides[3] = 1
* @li "pads" is listint.
* @li "ceil_mode" defaults to False.
* @li "data_format" defaults to "NC1HWC0". \n
* @li ksize: is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
* @li strides: is a list that has length 4: strides[0] = 1 or strides[3] = 1
* @li pads: listint.
* @li ceil_mode: defaults to False.
* @li data_format: A optional string. \n

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator MaxPoolGradWithArgmaxV1.
@@ -1693,7 +1712,7 @@ REG_OP(MaxPoolGradWithArgmaxV1)

* @par Inputs:
* One input:
* x: An NC1HWC0 Tensor of type float16. \n
* x: A Tensor of type float16. \n

* @par Attributes:
* @li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for
@@ -1704,15 +1723,15 @@ REG_OP(MaxPoolGradWithArgmaxV1)

* @par Outputs:
* y: A Tensor. Has the same type and format as input "x".
* argmax: A Tensor. type:uint16, format:NC1HWC0. \n
* argmax: A Tensor. type:uint16. \n

* @attention Constraints:
* @li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
* @li "stride is a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1,
* @li ksize: a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
* @li stride: a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1,
* strides[2] <= 63, strides[2] >= 1.
* @li "pads" is listint.
* @li "ceil_mode" defaults to False.
* @li "data_format" defaults to "NC1HWC0". \n
* @li pads: listint.
* @li ceil_mode: defaults to False.
* @li data_format: A optional string. \n

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator MaxPoolWithArgmaxV1.


+ 1
- 0
third_party/fwkacllib/inc/ops/nn_training_ops.h View File

@@ -159,6 +159,7 @@ REG_OP(SparseApplyAdagrad)
.INPUT(grad, TensorType({DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT}))
.OUTPUT(accum, TensorType({DT_FLOAT}))
.ATTR(use_locking, Bool, false)
.ATTR(update_slots, Bool, true)
.OP_END_FACTORY_REG(SparseApplyAdagrad)


+ 77
- 6
third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h View File

@@ -423,8 +423,8 @@ REG_OP(Softplus)

*@par Inputs:
*Two inputs:
* @li gradients: An NC1HWC0 or ND Tensor of type float16 or float32.
* @li features: An NC1HWC0 or ND Tensor of type float16 or float32.
* @li gradients: A ND Tensor of type float16 or float32.
* @li features: A ND Tensor of type float16 or float32.


*@par Outputs:
@@ -457,16 +457,35 @@ REG_OP(Softsign)
.OUTPUT(y, TensorType::FloatingDataType())
.OP_END_FACTORY_REG(Softsign)

/**
* @brief Computes softsignGrad: y_grad / (1 + abs(x)) ** 2 .
*
* @par Inputs:
* Two inputs, including:
 * @li y_grad: A Tensor. Must be one of the following types: float16, float32.
 * @li x: A Tensor of the same type and shape as "y_grad".

 * @par Outputs:
 * x_grad: A Tensor. Has the same type as "y_grad".
* @par Third-party framework compatibility
* Compatible with the TensorFlow operator SoftsignGrad.
*/
REG_OP(SoftsignGrad)
.INPUT(y_grad, TensorType::FloatingDataType())
.INPUT(x, TensorType::FloatingDataType())
.OUTPUT(x_grad, TensorType::FloatingDataType())
.OP_END_FACTORY_REG(SoftsignGrad)

/**
*@brief Computes scaled exponential linear: scale * alpha * (exp(x) - 1) . \n

*@par Inputs:
* One input:
*x: A Tensor. Must be one of the following types: float16, float, double
* int32, int8. format:ND, NC1HWC0 . \n
* int32, int8. format:ND. \n

*@par Outputs:
*y: A Tensor. Has the same type and format as input "x". format:ND, NC1HWC0 . \n
*y: A Tensor. Has the same type and format as input "x". format:ND. \n

*@see Region()

@@ -480,6 +499,26 @@ REG_OP(Selu)
DT_INT8,DT_INT32}))
.OP_END_FACTORY_REG(Selu)

/**
* @brief Computes SeluGrad backprops: y_grad * (y + scale * alpha)
* if y < 0, scale * y_grad otherwise .

* @par Inputs:
* Two inputs, including:
* @li y_grad: A Tensor of type RealNumberType .
* @li y: A Tensor of type RealNumberType .
* @par Outputs:
* x_grad: A Tensor. Must have the same type as "y_grad" .

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator SeluGrad.
*/
REG_OP(SeluGrad)
.INPUT(y_grad, TensorType::RealNumberType())
.INPUT(y, TensorType::RealNumberType())
.OUTPUT(x_grad, TensorType::RealNumberType())
.OP_END_FACTORY_REG(SeluGrad)

/**
*@brief Computes rectified linear gradients for a ReLU operation . \n

@@ -640,7 +679,9 @@ REG_OP(Elu)
*x: A float16, float32, for the input data type . \n

*@par Attributes:
*li alpha: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" .
*@li alpha1: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" .
*@li alpha2: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" .
*@li alpha3: A float32. Defines at which positive value the ELU saturates. Defaults to "1.0" . \n

*@par Outputs:
*y: A float16, float32, for the normalized result . \n
@@ -656,9 +697,39 @@ REG_OP(Elu)
REG_OP(Celu)
.INPUT(x, TensorType({DT_FLOAT,DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT,DT_FLOAT16}))
.ATTR(alpha, Float, 1.0)
.ATTR(alpha1, Float, 1.0)
.ATTR(alpha2, Float, 1.0)
.ATTR(alpha3, Float, 1.0)
.OP_END_FACTORY_REG(Celu)

/**
*@brief Continuously Differentiable Exponential Linear Units:
* Perform the linear unit element-wise on the input tensor X using formula:
* max(0, x) + min(0, alpha * (exp(x/alpha) - 1)). \n

*@par Inputs:
*x: A float16, float32, for the input data type . \n

*@par Attributes:
*@li alpha: A float32. Defines at which negative value the CELU saturates. Defaults to "1.0" .

*@par Outputs:
*y: A float16, float32, for the normalized result . \n

*@attention Constraints:
*@li The input is of type float16 or float32 . \n

*@par Multiple batches supported or not
*Supported
*@par Third-party framework compatibility
*@li Compatible with ONNX's Celu operator
*/
REG_OP(CeluV2)
.INPUT(x, TensorType({DT_FLOAT,DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT,DT_FLOAT16}))
.ATTR(alpha, Float, 1.0)
.OP_END_FACTORY_REG(CeluV2)

/**
*@brief Computes gradients for the exponential linear (Elu) operation.
*


+ 1
- 2
third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h View File

@@ -141,8 +141,7 @@ REG_OP(NPUClearFloatStatusV2)
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(NPUGetFloatStatusV2)
.DYNAMIC_INPUT(addr, TensorType{DT_FLOAT})
.OUTPUT(data, TensorType({DT_FLOAT}))
.OUTPUT(data, TensorType({DT_INT32}))
.OP_END_FACTORY_REG(NPUGetFloatStatusV2)
} // namespace ge



+ 8
- 8
third_party/fwkacllib/inc/ops/quantize_ops.h View File

@@ -115,13 +115,13 @@ REG_OP(AscendQuant)
*@brief Dequantizes the input . \n

*@par Inputs:
*@li x: An tensor of type int32, specifying the input.
*@li deq_scale: An tensor of type float16 or uint64, specifying the scaling ratio . \n
* @li x: An tensor of type int32, specifying the input.
* @li deq_scale: An tensor of type uint64, specifying the scaling ratio . \n

*@par Attributes:
*@li sqrt_mode: A optional bool, specifying whether to perform square root on "scale", either "True" or "False". Defaults to "False".
*@li relu_flag: A optional bool, specifying whether to perform ReLU, either "True" or "False". Defaults to "False".
*@li dtype: A optional int32, specifying the output data type. Defaults to "DT_FLOAT" . \n
* @li sqrt_mode: A optional bool, specifying whether to perform square root on "scale", either "True" or "False". Defaults to "False".
* @li relu_flag: A optional bool, specifying whether to perform ReLU, either "True" or "False". Defaults to "False".
* @li dtype: A optional int32, specifying the output data type. Defaults to "DT_FLOAT" . \n

*@par Outputs:
*y: The dequantized output tensor of type float16 or float32. \n
@@ -246,14 +246,14 @@ REG_OP(AscendRequantS16)
* @brief Quantizes the input of int8 . \n

* @par Inputs:
* @li x: An FRACTAL_Z tensor of type int8, specifying the input.
* @li offset: An FRACTAL_Z tensor of type int8.
* @li x: A tensor of type int8, specifying the input.
* @li offset: A tensor of type int8.

* @par Attributes:
* @li dst_type: A optional int from: DT_INT8, DT_INT4. Defaults to DT_INT8.

* @par Outputs:
* @li y: output tensor of type int4 or int8 and with format FRACTAL_Z.
* @li y: output tensor of type int4 or int8.

* @par Third-party framework compatibility
* It is a custom operator. It has no corresponding operator in Caffe, ONNX, TensorFlow or PyTorch.


+ 101
- 93
third_party/fwkacllib/inc/ops/reduce_ops.h View File

@@ -28,7 +28,7 @@ namespace ge {
*@brief Performs reduced batch normalization . \n

*@par Inputs:
*x: A 5D Tensor of type float16 or float32, with format NC1HWC0 . \n
*x: A tensor of type float16 or float32. \n

*@par Outputs:
*@li sum: A 1D Tensor of type float32 for SUM reduced "x".
@@ -49,11 +49,11 @@ REG_OP(BNTrainingReduce)
*@brief Performs reduced batch normalization . \n

*@par Inputs:
*x: A 6D Tensor of type float16 or float32, with format NDC1HWC0 . \n
*x: A tensor of type float16 or float32. \n

*@par Outputs:
*@li sum: A 3D Tensor of type float32 for SUM reduced "x".
*@li square_sum: A 3D Tensor of type float32 for SUMSQ reduced "x" . \n
*@li sum: A tensor of type float32 for SUM reduced "x".
*@li square_sum: A tensor of type float32 for SUMSQ reduced "x" . \n

*@attention Constraints:
* This operator is a BatchNorm fusion operator for updating the moving
@@ -71,17 +71,17 @@ REG_OP(BN3DTrainingReduce)

*@par Inputs:
* Seven inputs, including:
*@li grads: A 5D Tensor of type float16 or float32, with format NC1HWC0, for
*@li grads: A tensor of type float16 or float32, for
* the gradient.
*@li x: A 5D Tensor of type float16 or float32, with format NC1HWC0.
*@li diff_scale: A 5D Tensor of type float32, with format NC1HWC0,
*@li x: A tensor of type float16 or float32.
*@li diff_scale: A tensor of type float32,
* for the mean of "x".
*@li diff_offset: A 5D Tensor of type float32, with format NC1HWC0,
*@li diff_offset: A tensor of type float32,
* for the variance of "x".
*@li scale: A 5D Tensor of type float32, with format NC1HWC0.
*@li batch_mean: A 5D Tensor of type float32, with format NC1HWC0,
*@li scale: A tensor of type float32.
*@li batch_mean: A tensor of type float32,
* for the mean of "x".
*@li batch_variance: A 5D Tensor of type float32, with format NC1HWC0,
*@li batch_variance: A tensor of type float32,
* for the variance of "x" . \n

*@par Attributes:
@@ -89,7 +89,7 @@ REG_OP(BN3DTrainingReduce)
* added to the variance of "x" . \n

*@par Outputs:
*y: A Tensor of type float16 or float32, with format NC1HWC0, for the offset
*y: A Tensor of type float16 or float32, for the offset
* of "x" . \n

*@attention Constraints:
@@ -114,17 +114,17 @@ REG_OP(BNTrainingReduceGrad)

*@par Inputs:
* Seven inputs, including:
*@li grads: A 6D Tensor of type float16 or float32, with format NDC1HWC0, for
*@li grads: A tensor of type float16 or float32, for
* the gradient.
*@li x: A 6D Tensor of type float16 or float32, with format NDC1HWC0.
*@li diff_scale: A 6D Tensor of type float32, with format NDC1HWC0,
*@li x: A tensor of type float16 or float32.
*@li diff_scale: A tensor of type float32,
* for the mean of "x".
*@li diff_offset: A 6D Tensor of type float32, with format NDC1HWC0,
*@li diff_offset: A tensor of type float32,
* for the variance of "x".
*@li scale: A 6D Tensor of type float32, with format NDC1HWC0.
*@li batch_mean: A 6D Tensor of type float32, with format NDC1HWC0,
*@li scale: A tensor of type float32.
*@li batch_mean: A tensor of type float32,
* for the mean of "x".
*@li batch_variance: A 6D Tensor of type float32, with format NDC1HWC0,
*@li batch_variance: A tensor of type float32,
* for the variance of "x" . \n

*@par Attributes:
@@ -132,7 +132,7 @@ REG_OP(BNTrainingReduceGrad)
* added to the variance of "x" . \n

*@par Outputs:
*y: A Tensor of type float16 or float32, with format NDC1HWC0, for the offset
*y: A Tensor of type float16 or float32, for the offset
* of "x" . \n

*@attention Constraints:
@@ -156,8 +156,8 @@ REG_OP(BN3DTrainingReduceGrad)
*@brief Performs reduced batch normalization . \n

*@par Inputs:
* Seven inputs, including: (NC1HWC0 supported)
*@li x: A 5D Tensor of type float16 or float32.
* Seven inputs, including:
*@li x: A tensor of type float16 or float32.
*@li sum: A 1D Tensor of type float32 for the output of operator
* BNTrainingReduce.
*@li square_sum: A 1D Tensor of type float32 for the output of operator
@@ -174,10 +174,10 @@ REG_OP(BN3DTrainingReduceGrad)
* and variance . \n

*@par Outputs:
* Five outputs, including: (NC1HWC0 supported)
*@li y: A 5D Tensor of type float16 or float32, for normalized "x".
*@li mean: A 5D Tensor of type float32, for the updated mean.
*@li variance: A 5D Tensor of type float32, for the updated variance.
* Five outputs, including:
*@li y: A tensor of type float16 or float32, for normalized "x".
*@li mean: A tensor of type float32, for the updated mean.
*@li variance: A tensor of type float32, for the updated variance.
*@li batch_mean: A 1D Tensor of type float32, for the mean of "x".
*@li batch_variance: A 1D Tensor of type float32, for the variance of "x" . \n

@@ -209,16 +209,16 @@ REG_OP(BNTrainingUpdate)
*@brief Performs reduced batch normalization . \n

*@par Inputs:
* Seven inputs, including: (NDC1HWC0 supported)
*@li x: A 6D Tensor of type float16 or float32.
*@li sum: A 6D Tensor of type float32 for the output of operator
* Seven inputs, including:
*@li x: A tensor of type float16 or float32.
*@li sum: A tensor of type float32 for the output of operator
* BN3DTrainingUpdate.
*@li square_sum: A 6D Tensor of type float32 for the output of operator
*@li square_sum: A tensor of type float32 for the output of operator
* BN3DTrainingUpdate.
*@li scale: A 6D Tensor of type float32, for the scaling factor.
*@li offset: A 6D Tensor of type float32, for the scaling offset.
*@li mean: A 6D Tensor of type float32, for the updated mean.
*@li variance: A 6D Tensor of type float32, for the updated variance . \n
*@li scale: A tensor of type float32, for the scaling factor.
*@li offset: A tensor of type float32, for the scaling offset.
*@li mean: A tensor of type float32, for the updated mean.
*@li variance: A tensor of type float32, for the updated variance . \n

*@par Attributes:
*@li epsilon: A required float32, specifying the small value added to variance
@@ -227,12 +227,12 @@ REG_OP(BNTrainingUpdate)
* and variance . \n

*@par Outputs:
* Five outputs, including: (NDC1HWC0 supported)
*@li y: A 6D Tensor of type float16 or float32, for normalized "x".
*@li mean: A 6D Tensor of type float32, for the updated mean.
*@li variance: A 6D Tensor of type float32, for the updated variance.
*@li batch_mean: A 6D Tensor of type float32, for the mean of "x".
*@li batch_variance: A 6D Tensor of type float32, for the variance of "x" . \n
* Five outputs, including:
*@li y: A tensor of type float16 or float32, for normalized "x".
*@li mean: A tensor of type float32, for the updated mean.
*@li variance: A tensor of type float32, for the updated variance.
*@li batch_mean: A tensor of type float32, for the mean of "x".
*@li batch_variance: A tensor of type float32, for the variance of "x" . \n

*@attention Constraints:
*@li This operator is a BatchNorm fusion operator for updating the moving
@@ -262,19 +262,19 @@ REG_OP(BN3DTrainingUpdate)
*@brief Performs batch normalization for inference . \n

*@par Inputs:
* Five inputs, including: (NC1HWC0 supported)
*@li x: A 5D Tensor of type float16 or float32.
*@li scale: A 5D Tensor of type float32, for the scaling factor.
*@li offset: A 5D Tensor of type float32, for the scaling offset.
*@li mean: A 5D Tensor of type float32, for the mean.
*@li variance: A 5D Tensor of type float32, for the variance . \n
* Five inputs, including:
*@li x: A tensor of type float16 or float32.
*@li scale: A tensor of type float32, for the scaling factor.
*@li offset: A tensor of type float32, for the scaling offset.
*@li mean: A tensor of type float32, for the mean.
*@li variance: A tensor of type float32, for the variance . \n

*@par Attributes:
*epsilon: An optional float32, specifying the small value added to variance to
* avoid dividing by zero. Defaults to "0.0001" . \n

*@par Outputs:
*y: A 5D Tensor of type float16 or float32 for the normalized "x" . \n
*y: A tensor of type float16 or float32 for the normalized "x" . \n

*@attention Constraints:
*For Ascend 310, the result accuracy fails to reach 1/1000 due to the square root
@@ -295,21 +295,21 @@ REG_OP(BNInfer)
assignmoving average . \n

*@par Inputs:
*Five inputs, including: (NC1HWC0 supported)
*@li x: A 5D Tensor of type float16 or float32.
*@li sum: A 5D Tensor of type float32 for the output of operator BNTrainingReduce.
*@li square_sum: A 5D Tensor of type float32 for the output of operator BNTrainingReduce.
*@li scale: A 5D Tensor of type float32, for the scaling factor.
*@li offset: A 5D Tensor of type float32, for the scaling offset . \n
*Five inputs, including:
*@li x: A tensor of type float16 or float32.
*@li sum: A tensor of type float32 for the output of operator BNTrainingReduce.
*@li square_sum: A tensor of type float32 for the output of operator BNTrainingReduce.
*@li scale: A tensor of type float32, for the scaling factor.
*@li offset: A tensor of type float32, for the scaling offset . \n

*@par Attributes:
*epsilon: A required float32, specifying the small value added to variance to avoid dividing by zero . \n

*@par Outputs:
*Three outputs, including: (NC1HWC0 supported)
*@li y: A 5D Tensor of type float16 or float32, for normalized "x".
*@li batch_mean: A 5D Tensor of type float32, for the mean of "x".
*@li batch_variance: A 5D Tensor of type float32, for the variance of "x" . \n
*Three outputs, including:
*@li y: A tensor of type float16 or float32, for normalized "x".
*@li batch_mean: A tensor of type float32, for the mean of "x".
*@li batch_variance: A tensor of type float32, for the variance of "x" . \n

*@attention Constraints:
*This operator is used in conjunction with BNTrainingReduce.
@@ -332,22 +332,22 @@ REG_OP(BNTrainingUpdateV2)
assign moving average . \n

*@par Inputs:
* Five inputs, including: (NC1HWC0 supported)
*@li x: A 5D Tensor of type float16 or float32.
*@li sum: A 5D Tensor of type float32 for the output of operator BNTrainingReduce.
*@li square_sum: A 5D Tensor of type float32 for the output of operator BNTrainingReduce.
*@li scale: A 5D Tensor of type float32, for the scaling factor.
*@li offset: A 5D Tensor of type float32, for the scaling offset . \n
* Five inputs, including:
*@li x: A tensor of type float16 or float32.
*@li sum: A tensor of type float32 for the output of operator BNTrainingReduce.
*@li square_sum: A tensor of type float32 for the output of operator BNTrainingReduce.
*@li scale: A tensor of type float32, for the scaling factor.
*@li offset: A tensor of type float32, for the scaling offset . \n

*@par Attributes:
*epsilon: A required float32, specifying the small value added to variance to avoid dividing by zero . \n

*@par Outputs:
*@li y: A 5D Tensor of type float16 or float32, for normalized "x".
*@li batch_mean: A 5D Tensor of type float32, for the mean of "x".
*@li batch_variance: A 5D Tensor of type float32, for the variance of "x".
*@li reserve_1: A 5D Tensor of type float32, for the mean of batch "x". Has the same type as batch_mean.
*@li reserve_2: A 5D Tensor of type float32, for the variance of batch "x". Has the same type as batch_mean . \n
*@li y: A tensor of type float16 or float32, for normalized "x".
*@li batch_mean: A tensor of type float32, for the mean of "x".
*@li batch_variance: A tensor of type float32, for the variance of "x".
*@li reserve_1: A tensor of type float32, for the mean of batch "x". Has the same type as batch_mean.
*@li reserve_2: A tensor of type float32, for the variance of batch "x". Has the same type as batch_mean . \n

*@attention Constraints:
*@li This operator is used in conjunction with BNTrainingReduce.
@@ -372,12 +372,12 @@ REG_OP(BNTrainingUpdateV3)

*@par Inputs:
* Four inputs, including:
*@li grads: A 5D Tensor of type float16 or float32, with format NC1HWC0,
*@li grads: A tensor of type float16 or float32,
* for the gradient.
*@li x: A 5D Tensor of type float16 or float32, with format NC1HWC0.
*@li batch_mean: A 5D Tensor of type float32, with format NC1HWC0,
*@li x: A tensor of type float16 or float32.
*@li batch_mean: A tensor of type float32,
* for the mean of "x".
*@li batch_variance: A 5D Tensor of type float32, with format NC1HWC0,
*@li batch_variance: A tensor of type float32,
* for the variance of "x" . \n

*@par Attributes:
@@ -385,9 +385,9 @@ REG_OP(BNTrainingUpdateV3)
* added to the variance of "x" . \n

*@par Outputs:
*@li diff_scale: A Tensor of type float32, with format NC1HWC0,
*@li diff_scale: A Tensor of type float32,
* for the offset of "scale".
*@li diff_offset: A Tensor of type float32, with format NC1HWC0,
*@li diff_offset: A Tensor of type float32,
* for the offset of "offset" . \n

*/
@@ -406,12 +406,12 @@ REG_OP(BNTrainingUpdateGrad)

*@par Inputs:
* Four inputs, including:
*@li grads: A 6D Tensor of type float16 or float32, with format NDC1HWC0,
*@li grads: A tensor of type float16 or float32,
* for the gradient.
*@li x: A 6D Tensor of type float16 or float32, with format NDC1HWC0.
*@li batch_mean: A 6D Tensor of type float32, with format NDC1HWC0,
*@li x: A tensor of type float16 or float32.
*@li batch_mean: A tensor of type float32,
* for the mean of "x".
*@li batch_variance: A 6D Tensor of type float32, with format NDC1HWC0,
*@li batch_variance: A tensor of type float32,
* for the variance of "x" . \n

*@par Attributes:
@@ -419,9 +419,9 @@ REG_OP(BNTrainingUpdateGrad)
* added to the variance of "x" . \n

*@par Outputs:
*@li diff_scale: A Tensor of type float32, with format NDC1HWC0,
*@li diff_scale: A Tensor of type float32,
* for the offset of "scale".
*@li diff_offset: A Tensor of type float32, with format NDC1HWC0,
*@li diff_offset: A Tensor of type float32,
* for the offset of "offset" . \n

*/
@@ -440,15 +440,15 @@ REG_OP(BN3DTrainingUpdateGrad)

*@par Inputs:
* Three inputs, including:
*@li grads: A 5D Tensor of type float16 or float32, with format NC1HWC0, for the gradient.
*@li scale: A 5D Tensor of type float32, with format NC1HWC0.
*@li batch_variance: A 5D Tensor of type float32, with format NC1HWC0. It is an output of BatchNorm . \n
*@li grads: A tensor of type float16 or float32, for the gradient.
*@li scale: A tensor of type float32.
*@li batch_variance: A tensor of type float32. It is an output of BatchNorm . \n

*@par Attributes:
*epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x" . \n

*@par Outputs:
*x_backprop: A Tensor of type float16 or float32, with format NC1HWC0, for the offset of "x" . \n
*x_backprop: A Tensor of type float16 or float32, for the offset of "x" . \n

*@attention Constraints:
* The preceding layer of this operator must be operator BatchNorm.
@@ -663,6 +663,9 @@ REG_OP(ReduceProdD)
*keep_dims: A bool or NoneType.
* - If true, retains reduced dimensions with length 1.
* - If false, the rank of the tensor is reduced by 1 for each entry in axis.
*noop_with_empty_axes: A bool.
* - If true, when axes = [], not reduce.
* - If false, when axes = [], reduce all.
*@par Outputs:
*y: A Tensor. Has the same type as "x" . \n

@@ -674,6 +677,7 @@ REG_OP(ReduceMean)
.INPUT(axes, TensorType::IndexNumberType())
.OUTPUT(y, TensorType::NumberType())
.ATTR(keep_dims, Bool, false)
.ATTR(noop_with_empty_axes, Bool, true)
.OP_END_FACTORY_REG(ReduceMean)

/**
@@ -690,6 +694,9 @@ REG_OP(ReduceMean)
*@li keep_dims: A bool or NoneType.
* - If true, retains reduced dimensions with length 1.
* - If false, the rank of the tensor is reduced by 1 for each entry in axis.
*@li noop_with_empty_axes: A bool default False.
* - If true, same as tf.
* - If false, when x's shape is [], reduce all dims, for onnx.
*@par Outputs:
*y: A Tensor. Has the same type as "x" . \n

@@ -704,6 +711,7 @@ REG_OP(ReduceMeanD)
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(axes, ListInt)
.ATTR(keep_dims, Bool, false)
.ATTR(noop_with_empty_axes, Bool, false)
.OP_END_FACTORY_REG(ReduceMeanD)

/**
@@ -983,7 +991,7 @@ REG_OP(EuclideanNormD)
*@brief Performs instance normalization for inference . \n

*@par Inputs:
* Five inputs, including: (NC1HWC0 supported)
* Five inputs, including:
*@li x: A Tensor of type float16 or float32.
*@li gamma: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling gamma.
*@li beta: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling beta.
@@ -1184,22 +1192,22 @@ REG_OP(GNTrainingReduce)
*@par Inputs:
* Seven inputs, including: (NCHW NHWC supported)
*@li x: A Tensor of type float16 or float32.
*@li sum: A 5D Tensor of type float32,
*@li sum: A tensor of type float32,
shape is [N, G, 1, 1, 1] for NCHW, [N, 1, 1, G, 1] for NHWC
for the output of operator GNTrainingReduce.
*@li square_sum: A 5D Tensor of type float32,
*@li square_sum: A tensor of type float32,
shape is [N, G, 1, 1, 1] for NCHW, [N, 1, 1, G, 1] for NHWC
for the output of operator GNTrainingReduce.
*@li scale: A 5D Tensor of type float32,
*@li scale: A tensor of type float32,
shape is [1, G, 1, 1, 1] for NCHW, [1, 1, 1, G, 1] for NHWC
is for the scaling gamma.
*@li offset: A 5D Tensor of type float32,
*@li offset: A tensor of type float32,
shape is [1, G, 1, 1, 1] for NCHW, [1, 1, 1, G, 1] for NHWC
for the scaling beta.
*@li mean: A 5D Tensor of type float32,
*@li mean: A tensor of type float32,
shape is [N, G, 1, 1, 1] for NCHW, [N, 1, 1, G, 1] for NHWC
for the updated mean.
*@li variance: A 5D Tensor of type float32,
*@li variance: A tensor of type float32,
shape is [N, G, 1, 1, 1] for NCHW, [N, 1, 1, G, 1] for NHWC
for the updated variance.

@@ -1209,7 +1217,7 @@ for the updated variance.
*@li num_groups: Int, specifying the num of groups. required, same to GNTrainingReduce

*@par Outputs:
* Three outputs, including: (NC1HWC0 supported)
* Three outputs, including:
*@li y: A Tensor of type float16 or float32, for normalized "x".
*@li batch_mean: A Tensor of type float32, for the updated mean.
*@li batch_variance: A Tensor of type float32, for the updated variance . \n
@@ -1338,7 +1346,7 @@ REG_OP(ReduceStdWithMean)
*@brief Performs reduced batch normalization . \n

*@par Inputs:
*x: A 5D Tensor of type float16 or float32, with format NC1HWC0 . \n
*x: A tensor of type float16 or float32 . \n

*@par Outputs:
*@li mean: A Tensor of type float32 for SUM reduced "x".


+ 545
- 248
third_party/fwkacllib/inc/ops/rnn.h
File diff suppressed because it is too large
View File


+ 82
- 30
third_party/fwkacllib/inc/ops/selection_ops.h View File

@@ -940,11 +940,13 @@ REG_OP(SliceDV2)
* @par Attributes:
* @li k: A required int that is at least 0, specifying the number of top elements
* to look for along the last dimension (along each row for matrices).
* @li sorted: An optional bool. Defaults to true.
* If true, the resulting "k" elements will be sorted by the values in descending
* order.
* @li sorted: An optional bool. Defaults to "True".
* If "True", the returned "k" elements are themselves sorted.
* If "False", the returned "k" elements are not sorted.
* @li dim: An optional int. Defaults to -1. For reserved use.
* @li largest: An optional bool. Defaults to true. For reserved use. \n
* @li largest: An optional bool, controls whether to return largest or smallest elements. Defaults to true.
* If "True", the "k" largest elements are returned in descending order.
* If "False", the "k" smallest elements are returned in ascending order. \n

* @par Outputs:
* @li values: A Tensor, specifying the sorted data. Has the same type as "input".
@@ -989,11 +991,14 @@ REG_OP(TopKD)
* int32 to float16. \n

* @par Attributes:
* @li sorted: An optional bool. Defaults to true.
* If true, the resulting "k" elements will be sorted by the values in descending
* order.
* @li sorted: An optional bool. Defaults to "True".
* If "True", the returned "k" elements are themselves sorted.
* If "False", the returned "k" elements are not sorted.
* @li dim: An optional int. Defaults to -1. For reserved use.
* @li largest: An optional bool. Defaults to true. For reserved use. \n
* @li largest: An optional bool, controls whether to return largest or smallest elements. Defaults to true.
* If "True", the "k" largest elements are returned in descending order.
* If "False", the "k" smallest elements are returned in ascending order. \n


* @par Outputs:
* @li values: A Tensor, specifying the sorted data. Has the same type as
@@ -1028,11 +1033,13 @@ REG_OP(TopKV2D)
* for matrices) . \n

* @par Attributes:
* @li sorted: An optional bool. Defaults to true.
* If true, the resulting "k" elements will be sorted by the values in descending
* order.
* @li sorted: An optional bool. Defaults to "True".
* If "True", the returned "k" elements are themselves sorted.
* If "False", the returned "k" elements are not sorted.
* @li dim: An optional int. Defaults to -1. For reserved use.
* @li largest: An optional bool. Defaults to true. For reserved use. \n
* @li largest: An optional bool, controls whether to return largest or smallest elements. Defaults to true.
* If "True", the "k" largest elements are returned in descending order.
* If "False", the "k" smallest elements are returned in ascending order. \n

* @par Outputs:
* @li values: A Tensor, specifying the sorted data. Has the same type as
@@ -1066,10 +1073,12 @@ REG_OP(TopKV2)
* for matrices) . \n

* @par Attributes:
* @li sorted: Defaults to true.
* If true, the resulting "k" elements will be sorted by the values in descending
* order.
* @li largest:If true the resulting `k` elements will be sorted by the values in descending order.
* @li sorted: An optional bool. Defaults to "True".
* If "True", the returned "k" elements are themselves sorted.
* If "False", the returned "k" elements are not sorted.
* @li largest: An optional bool, controls whether to return largest or smallest elements. Defaults to true.
* If "True", the "k" largest elements are returned in descending order.
* If "False", the "k" smallest elements are returned in ascending order.
* @li dim:0-D. Number of top elements to look for along the last dimension (along each row for matrices). \n

* @par Outputs:
@@ -2534,32 +2543,75 @@ REG_OP(StridedSliceV3)
.OP_END_FACTORY_REG(StridedSliceV3)

/**
*@brief MovingSumWithSigmoid.
* @brief Sum the alpha according to the offset and ksize,
and multiply it with the sigmoid value of energy. \n

*@par Inputs:
*Four inputs, including:
* @par Inputs:
* Three inputs, including:
* @li alpha: A Tensor. Must be one of the following types: float32, float16.
* @li energy: A Tensor. Must be one of the following types: float32, float16.
* @li beam_size: A Tensor of type int32.
* @li frame_size: A Tensor of type int32. \n
* @li offset: A Tensor of type int32. \n

*@par Outputs:
* y: A Tensor. Has the same type as "alpha". \n
* y: A Tensor with same type as "alpha". \n
*
* @par Attributes:
 * window_size: An int.
 * ksize: An int.
*
* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(MovingSumWithSigmoid)
.INPUT(alpha, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(energy, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(beam_size, TensorType({DT_INT32}))
.INPUT(frame_size, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(window_size, Int)
.INPUT(alpha, TensorType::BasicType())
.INPUT(energy, TensorType::BasicType())
.INPUT(offset, TensorType({DT_INT32}))
.OUTPUT(y, TensorType::BasicType())
.REQUIRED_ATTR(ksize, Int)
.OP_END_FACTORY_REG(MovingSumWithSigmoid)

/**
* @brief Choose the value of X with value according to mask.

* @par Inputs:
* two inputs, including:
* @li x: A Tensor of dtype is BasicType.
* @li mask: A Tensor of dtype is bool. \n

* @par Outputs:
* y: A tensor with the same type as x. \n

* @par Third-party framework compatibility
* Compatible with the Numpy operator select.\n
*/
REG_OP(MaskedSelect)
.INPUT(x, TensorType::BasicType())
.INPUT(mask, TensorType({DT_BOOL}))
.OUTPUT(y, TensorType::BasicType())
.OP_END_FACTORY_REG(MaskedSelect)

/**
* @brief Sum X1 and X2 according to the offset recorded in seq_len1 and seq_len2. \n

* @par Inputs:
* Four inputs, including:
* @li x1: A Tensor. Support BasicType.
* @li x2: A Tensor. Support BasicType.
* @li seq_len1: A Tensor. Support int32.
* @li seq_len2: A Tensor. Support int32. \n

* @par Outputs:
* y: A Tensor with same type as "x1". \n

* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DynSeqOuter)
.INPUT(x1, TensorType::BasicType())
.INPUT(x2, TensorType::BasicType())
.INPUT(seq_len1, TensorType({DT_INT32}))
.INPUT(seq_len2, TensorType({DT_INT32}))
.OUTPUT(y, TensorType::BasicType())
.OP_END_FACTORY_REG(DynSeqOuter)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_SELECTION_OPS_H_

+ 4
- 4
third_party/fwkacllib/inc/ops/split_combination_ops.h View File

@@ -188,7 +188,7 @@ REG_OP(ParallelConcat)

*@par Inputs:
* One input:
*x: Dynamic input.An NC1HWC0 or ND Tensor.
*x: Dynamic input.A ND Tensor.
*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64

*@par Attributes:
@@ -219,7 +219,7 @@ REG_OP(ConcatV2D)

*@par Inputs:
* Two inputs, including:
*@li Dynamic input "x" is An NC1HWC0 or ND Tensor.
*@li Dynamic input "x" is A ND Tensor.
*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64
*@li concat_dim: An int32, or int64. Specifies the dimension along which to concatenate . \n

@@ -247,7 +247,7 @@ REG_OP(ConcatV2)

*@par Inputs:
* One input:
*x:Dynamic input. An NC1HWC0 or ND Tensor.
*x:Dynamic input. A ND Tensor.
*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64

*@par Attributes:
@@ -278,7 +278,7 @@ REG_OP(ConcatD)

*@par Inputs:
* Two inputs, including:
*@li x: Dynamic input.An NC1HWC0 or ND Tensor.
*@li x: Dynamic input.A ND Tensor.
*Must be one of the following types: float16, float32, double, int32,
* uint8, int16, int8, complex64, int64, qint8, quint8, qint32, uint16,
* complex128, uint32, uint64, qint16, quint16.


+ 24
- 0
third_party/fwkacllib/inc/ops/stateful_random_ops.h View File

@@ -236,6 +236,30 @@ REG_OP(StatefulUniformInt)
.OUTPUT(y, TensorType({DT_INT64}))
.OP_END_FACTORY_REG(StatefulUniformInt)

/**
* @brief Advance the counter of a counter-based RNG. The state of the RNG after
* `rng_skip(n)` will be the same as that after `stateful_uniform([n])`
* (or any other distribution). The actual increment added to the
* counter is an unspecified implementation detail . \n

* @par Inputs:
* @li value: Stores the state of the RNG.
* @li algorithm: The RNG algorithm.
* @li delta: The amount of advancement . \n

* @par Outputs:
* value: Returns random values with the specified shape . \n

* @par Third-party framework compatibility
* Compatible with tensorflow RngReadAndSkipV2 operator.
*/

REG_OP(RngReadAndSkipV2)
.INPUT(value, TensorType({DT_INT64}))
.INPUT(algorithm, TensorType({DT_INT32}))
.INPUT(delta, TensorType({DT_UINT64}))
.OUTPUT(value, TensorType({DT_INT64}))
.OP_END_FACTORY_REG(RngReadAndSkipV2)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_STATEFUL_RANDOM_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/target_crop_and_resize.h View File

@@ -39,7 +39,7 @@ namespace ge {
*input_format: A required string, specifying the input format. \n

*@par Outputs:
*y: The output tensor of type uint8, format only support NC1HWC0_C04.
*y: The output tensor of type uint8.
*@par Third-party framework compatibility
* It is a custom operator. It has no corresponding operator in Caffe.
*


+ 17
- 14
third_party/fwkacllib/inc/ops/transformation_ops.h View File

@@ -138,9 +138,11 @@ REG_OP(Transpose)
* For branches without padding also can be types: int16, int64, uint8, uint16, uint32, uint64 . \n

*@par Attributes:
*@li src_format: A string source data format, can be "NHWC", "NCHW", "FRACTAL_Z" etc.
*@li dst_format: A string target data format, can be "NC1HWC0", "NCHW", "FRACTAL_Z" etc.
*@li group: A optional int32, default value is 1. \n
*@li src_format: A string source data format, can be "NHWC", "NCHW" etc.
*@li dst_format: A string target data format, can be "NCHW" etc.
*@li src_subformat: A optional int32 for source sub-format, default value is 0.
*@li dst_subformat: A optional int32 for target sub-format, default value is 0.
*@li groups: A optional int32, default value is 1. \n

*@par Outputs:
*dst: A Tensor. Has the same type as "src".
@@ -150,6 +152,8 @@ REG_OP(TransData)
.OUTPUT(dst, TensorType::BasicType())
.REQUIRED_ATTR(src_format, String)
.REQUIRED_ATTR(dst_format, String)
.ATTR(src_subformat, Int, 0)
.ATTR(dst_subformat, Int, 0)
.ATTR(groups, Int, 1)
.OP_END_FACTORY_REG(TransData)

@@ -236,13 +240,13 @@ REG_OP(Flatten)

*@par Inputs:
* Three inputs, including:
*@li x: A 5D Tensor of type float16 or int8 or uint8, with format NC1HWC0.
*@li x: A 5D Tensor of type float16 or int8 or uint8.
*@li block_shape: A 1D list or tuple of int32 or int64.
*@li crops: A 2D list or tuple of int32 or int64. Specifies the amount to
*crop from start and end dimensions after permutation . \n

*@par Outputs:
*y: A Tensor with format NC1HWC0. Has the same type as input "x" . \n
*y: A Tensor has the same type as input "x" . \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator BatchToSpaceND.
@@ -259,7 +263,7 @@ REG_OP(BatchToSpaceND)

*@par Inputs:
* One input:
*x: A 5D Tensor of type float16 or int8 or uint8, with format NC1HWC0 . \n
*x: A 5D Tensor of type float16 or int8 or uint8. \n

*@par Attributes:
*@li block_shape: A required 1D list or tuple of int32 or int64.
@@ -267,7 +271,7 @@ REG_OP(BatchToSpaceND)
* from the start and end dimensions after permutation . \n

*@par Outputs:
*y: A Tensor with format NC1HWC0. Has the same type as input "x".
*y: A Tensor has the same type as input "x".


*@par Third-party framework compatibility
@@ -288,12 +292,12 @@ REG_OP(BatchToSpaceNDD)

*@par Inputs:
* Three inputs, including:
*@li x: A 5D Tensor of type float16 or float32, with format NC1HWC0.
*@li x: A 5D Tensor of type float16 or float32.
*@li block_shape: A 1D list or tuple of int32 or int64.
*@li paddings: A 2D list or tuple of int32 or int64. Specifies the padding for the start and end dimensions after permutation . \n

*@par Outputs:
*y: A Tensor with format NC1HWC0. Has the same type as input "x" . \n
*y: A Tensor has the same type as input "x" . \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator SpaceToBatchND.
@@ -310,14 +314,14 @@ REG_OP(SpaceToBatchND)

*@par Inputs:
* One input:
*x: A 5D Tensor of type float16 or float32, with format NC1HWC0 . \n
*x: A 5D Tensor of type float16 or float32. \n

*@par Attributes:
*@li block_shape: A required 1D list or tuple of int32 or int64.
*@li paddings: A required 2D list or tuple of int32 or int64. Specifies the padding for the start and end dimensions after permutation . \n

*@par Outputs:
*y: A Tensor with format NC1HWC0. Has the same type as input "x" . \n
*y: A Tensor has the same type as input "x" . \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator SpaceToBatchND.
@@ -516,7 +520,7 @@ REG_OP(SpaceToBatchD)
* tensors . \n

* @par Inputs:
* x: A rank-R tensor (R > 0) of type BasicType, with format ND or NC1HWC0 . \n
* x: A rank-R tensor (R > 0) of type BasicType. \n

* @par Attributes:
* @li num: A required int, specifying the number of tensors to be unpacked to.
@@ -529,8 +533,7 @@ REG_OP(SpaceToBatchD)

* @attention Constraints:
* @li If "num" is not specified, it is inferred from the shape of "x".
* @li For the ND format, "axis" is in the range [-R, R); For the NC1HWC0 format,
* "axis" must not be 2, 3, -2, or -3 . \n
* @li For the ND format, "axis" is in the range [-R, R). \n

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator Unpack.


+ 21
- 0
third_party/fwkacllib/inc/runtime/base.h View File

@@ -140,6 +140,16 @@ typedef void *rtModel_t;

#define RT_PROF_MAX_DEV_NUM 64

#define PATH_LEN_MAX 1023
#define PARAM_LEN_MAX 4095
typedef struct rtCommandHandleParams {
uint32_t pathLen;
uint32_t storageLimit; // MB
uint32_t profDataLen;
char_t path[PATH_LEN_MAX + 1];
char_t profData[PARAM_LEN_MAX + 1];
} rtCommandHandleParams_t;

/**
* @ingroup profiling_base
* @brief profiling command info
@@ -151,6 +161,7 @@ typedef struct rtProfCommandHandle {
uint32_t devIdList[RT_PROF_MAX_DEV_NUM];
uint32_t modelId;
uint32_t type;
rtCommandHandleParams_t commandHandleParams;
} rtProfCommandHandle_t;

/**
@@ -251,6 +262,16 @@ RTS_API rtError_t rtProfSetProSwitch(void *data, uint32_t len);
*/
RTS_API rtError_t rtProfRegisterCtrlCallback(uint32_t moduleId, rtProfCtrlHandle callback);

/**
* @ingroup profiling_base
* @brief set profiling switch, called by profiling
* @param [in] data rtProfilingCommandHandle
* @param [in] len length of data
* @return RT_ERROR_NONE for ok
* @return ACL_ERROR_RT_PARAM_INVALID for error input
*/
RTS_API rtError_t rtProfilingCommandHandle(uint32_t type, void *data, uint32_t len);

/**
* @ingroup dvrt_base
* @brief register callback for error code


+ 4
- 2
third_party/fwkacllib/inc/runtime/config.h View File

@@ -36,7 +36,8 @@ typedef enum tagRtChipType {
CHIP_CLOUD_V2 = 5,
CHIP_NO_DEVICE = 6,
CHIP_MINI_V3 = 7,
CHIP_END = 8,
CHIP_5612 = 8, /* 1911T */
CHIP_END = 9,
} rtChipType_t;

typedef enum tagRtAicpuScheType {
@@ -77,7 +78,8 @@ typedef enum tagRtPlatformType {
PLATFORM_CLOUD_V2 = 6,
PLATFORM_LHISI_SD3403 = 7,
PLATFORM_MINI_V3 = 8,
PLATFORM_END = 9,
PLATFORM_MINI_5612 = 9,
PLATFORM_END = 10,
} rtPlatformType_t;

typedef enum tagRtCubeFracMKNFp16 {


+ 0
- 53
third_party/fwkacllib/inc/runtime/dvfsprofile.h View File

@@ -1,53 +0,0 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
* Description: dvfsprofile.h
* Create: 2020-01-01
*/

#ifndef CCE_RUNTIME_DVFSPROFILE_H
#define CCE_RUNTIME_DVFSPROFILE_H

#include "base.h"

#if defined(__cplusplus)
extern "C" {
#endif

typedef enum dvfsProfileMode {
DVFS_PROFILE_PERFORMANCE_PRIORITY,
DVFS_PROFILE_BALANCE_PRIORITY,
DVFS_PROFILE_POWER_PRIORITY,
DVFS_PROFILE_PRIORITY_MAX
} DvfsProfileMode;

/**
* @ingroup dvrt_dvfsprofile
* @brief Set the performance mode of the device
* @param [in] profMode dvfsProfileMode
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtSetDvfsProfile(DvfsProfileMode profMode);

/**
* @ingroup dvrt_dvfsprofile
* @brief Set the performance mode of the device
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for invalid value
*/
RTS_API rtError_t rtUnsetDvfsProfile();

/**
* @ingroup dvrt_dvfsprofile
* @brief Get the current performance mode of the device
* @param [in|out] pmode dvfsProfileMode type pointer
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtGetDvfsProfile(DvfsProfileMode *pmode);

#if defined(__cplusplus)
}
#endif

#endif // CCE_RUNTIME_DVFSPROFILE_H

+ 51
- 59
third_party/fwkacllib/inc/runtime/kernel.h View File

@@ -130,6 +130,32 @@ typedef struct tagRtArgsWithTiling {
uint8_t reserved[6];
} rtArgsWithTiling_t;

/**
* @ingroup rt_kernel
* @brief host memory input struct
*/
typedef struct rtHostInputInfo {
uint16_t addrOffset;
uint16_t dataOffset;
} rtHostInputInfo_t;

/**
* @ingroup rt_kernel
* @brief args struct
*/
typedef struct tagRtArgsEx {
void *args; // args host mem addr
rtHostInputInfo_t *hostInputInfoPtr; // nullptr means no host mem input
uint32_t argsSize; // input + output + tiling addr size + tiling data size + host mem
uint16_t tilingAddrOffset; // tiling addr offset
uint16_t tilingDataOffset; // tiling data offset
uint16_t hostInputInfoNum; // hostInputInfo num
uint8_t hasTiling; // if has tiling: 0 means no tiling
uint8_t isNoNeedH2DCopy; // is no need host to device copy: 0 means need H2D copy,
// others means doesn't need H2D copy.
uint8_t reserved[4];
} rtArgsEx_t;

/**
* @ingroup rt_KernelConfigDump
* @brief device dump type
@@ -208,6 +234,7 @@ typedef void (*rtCallback_t)(void *fnData);
#define RT_KERNEL_DEVICE_FIRST (0x10U)
#define RT_KERNEL_HOST_ONLY (0x20U)
#define RT_KERNEL_HOST_FIRST (0x40U)
#define RT_KERNEL_BIUPERF_FLAG (0x80U)

/**
* @ingroup rt_kernel
@@ -375,36 +402,33 @@ RTS_API rtError_t rtKernelLaunch(const void *stubFunc, uint32_t blockDim, void *
/**
* @ingroup rt_kernel
* @brief launch kernel with handle to device
* @param [in] hdl program
* @param [in] kernelInfoExt kernel Info extension. device function description or tiling key,
* depending on static shape or dynamic shape.
* @param [in] blockDim block dimensions
* @param [in] args argument address for kernel function
* @param [in] argsSize arguments size
* @param [in] smDesc shared memory description
* @param [in] stm associated stream
* @param [in] kernelInfo kernel info
* @param [in] hdl program
* @param [in] tilingKey tilingKey
* @param [in] blockDim block dimensions
* @param [in] argsInfo argument address for kernel function
* @param [in] smDesc shared memory description
* @param [in] stm associated stream
* @param [in] kernelInfo kernel info
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtKernelLaunchWithHandle(void *hdl, const void *kernelInfoExt, uint32_t blockDim,
void *args, uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stream_,
RTS_API rtError_t rtKernelLaunchWithHandle(void *hdl, const uint64_t tilingKey, uint32_t blockDim,
rtArgsEx_t *argsInfo, rtSmDesc_t *smDesc, rtStream_t stm,
const void *kernelInfo);

/**
* @ingroup rt_kernel
* @ingroup rtKernelLaunchWithFlag
* @brief launch kernel to device
* @param [in] stubFunc stub function
* @param [in] blockDim block dimensions
* @param [in] args argument address for kernel function
* @param [in] argsSize arguments size
* @param [in] smDesc shared memory description
* @param [in] stm associated stream
* @param [in] flag dump flag
* @param [in] argsInfo argument address for kernel function
* @param [in] smDesc shared memory description
* @param [in] stm associated stream
* @param [in] flags dump flag
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtKernelLaunchWithFlag(const void *stubFunc, uint32_t blockDim, void *args, uint32_t argsSize,
RTS_API rtError_t rtKernelLaunchWithFlag(const void *stubFunc, uint32_t blockDim, rtArgsEx_t *argsInfo,
rtSmDesc_t *smDesc, rtStream_t stm, uint32_t flags);

/**
@@ -465,38 +489,37 @@ RTS_API rtError_t rtAicpuKernelLaunch(const rtKernelLaunchNames_t *launchNames,
uint32_t blockDim, const void *args, uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stm);

/**
* @ingroup rt_kernel(abandoned)
* @ingroup rtCpuKernelLaunchWithFlag(abandoned)
* @brief launch cpu kernel to device with dump identifier
* @param [in] soName so name
* @param [in] kernelName kernel name
* @param [in] blockDim block dimensions
* @param [in] args argument address for kernel function
* @param [in] argsSize arguments size
* @param [in] argsInfo argument address for kernel function
* @param [in] smDesc shared memory description
* @param [in] stm associated stream
* @param [in] stm associated stream
* @param [in] flag dump flag or others function flag
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtCpuKernelLaunchWithFlag(const void *soName, const void *kernelName, uint32_t blockDim,
const void *args, uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stm,
const rtArgsEx_t *argsInfo, rtSmDesc_t *smDesc, rtStream_t stm,
uint32_t flags);

/**
* @ingroup rt_kernel(in use)
* @ingroup rtAicpuKernelLaunchWithFlag(in use)
* @brief launch cpu kernel to device with dump identifier
* @param [in] launchNames names for kernel launch
* @param [in] blockDim block dimensions
* @param [in] args argument address for kernel function
* @param [in] argsSize arguments size
* @param [in] smDesc shared memory description
* @param [in] stm associated stream
* @param [in] flag dump flag or others function flag
* @param [in] stm associated stream
* @param [in] flags dump flag or others function flag
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtAicpuKernelLaunchWithFlag(const rtKernelLaunchNames_t *launchNames, uint32_t blockDim,
const void *args, uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stm, uint32_t flags);
const rtArgsEx_t *argsInfo, rtSmDesc_t *smDesc, rtStream_t stm,
uint32_t flags);

/**
* @ingroup rt_kernel
@@ -702,37 +725,6 @@ RTS_API rtError_t rtStartMDCProfiler(void **addr, uint32_t length);
*/
RTS_API rtError_t rtStopMDCProfiler(void *addr);

/**
* @ingroup rt_kernel
* @brief launch kernel with tiling data to device
* @param [in] stubFunc stub function
* @param [in] blockDim block dimensions
* @param [in] argsInfo argument info address for kernel function
* @param [in] smDesc shared memory description
* @param [in] stm associated stream
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtKernelLaunchWithTiling(const void *stubFunc, uint32_t blockDim,
rtArgsWithTiling_t *argsInfo, rtSmDesc_t *smDesc, rtStream_t stm);

/**
* @ingroup rt_kernel
* @brief launch kernel with handle and tiling data to device
* @param [in] hdl program
* @param [in] kernelInfoExt kernel Info extension. device function description or tiling key,
* depending on static shape or dynamic shape.
* @param [in] blockDim block dimensions
* @param [in] argsInfo argument info address for kernel function
* @param [in] smDesc shared memory description
* @param [in] stm associated stream
* @param [in] kernelInfo kernel info
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtKernelLaunchWithHandleAndTiling(void *hdl, const void *kernelInfoExt, uint32_t blockDim,
rtArgsWithTiling_t *argsInfo, rtSmDesc_t *smDesc, rtStream_t stm, const void* kernelInfo);

#if defined(__cplusplus)
}
#endif


+ 21
- 0
third_party/fwkacllib/inc/runtime/mem.h View File

@@ -30,6 +30,7 @@ extern "C" {
#define RT_MEMORY_DDR_NC (0x20U) // DDR memory of non-cache
#define RT_MEMORY_TS (0x40U) // Used for Ts memory
#define RT_MEMORY_TS_4G (0x40U) // Used for Ts memory(only 1951)
#define RT_MEMORY_HOST (0x81U) // Memory on host
#define RT_MEMORY_RESERVED (0x100U)

#define RT_MEMORY_L1 (0x1U << 16U)
@@ -57,6 +58,14 @@ extern "C" {
#define RT_MEMORY_POLICY_HUGE_PAGE_ONLY_P2P (0x4000U) // Malloc mem only use huge page, use for p2p, 0x1U << 14U
#define RT_MEMORY_POLICY_DEFAULT_PAGE_ONLY_P2P (0x8000U) // Malloc mem only use default page, use for p2p, 0x1U << 15U

/**
* @ingroup dvrt_mem
* @brief memory attribute
*/
#define RT_MEMORY_ATTRIBUTE_DEFAULT (0x0U)
// memory read only attribute, now only dvpp memory support.
#define RT_MEMORY_ATTRIBUTE_READONLY (0x100000U) // Malloc readonly, 1<<20.

#define MEM_ALLOC_TYPE_BIT (0x3FFU) // mem type bit in <0, 9>

/**
@@ -230,6 +239,18 @@ RTS_API rtError_t rtFree(void *devPtr);
*/
RTS_API rtError_t rtDvppMalloc(void **devPtr, uint64_t size);

/**
* @ingroup dvrt_mem
* @brief alloc device memory for dvpp, support set flag
* @param [in|out] devPtr memory pointer
* @param [in] size memory size
 * @param [in] flag mem flag; the memory attribute can be used to set read-only.
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
* @return others is error
*/
RTS_API rtError_t rtDvppMallocWithFlag(void **devPtr, uint64_t size, uint32_t flag);

/**
* @ingroup dvrt_mem
* @brief free device memory for dvpp


+ 0
- 1
third_party/fwkacllib/inc/runtime/rt.h View File

@@ -11,7 +11,6 @@
#include "config.h"
#include "context.h"
#include "dev.h"
#include "dvfsprofile.h"
#include "event.h"
#include "kernel.h"
#include "mem.h"


+ 1
- 1
third_party/fwkacllib/inc/runtime/rt_ffts_plus.h View File

@@ -36,4 +36,4 @@ RTS_API rtError_t rtFftsPlusTaskLaunchWithFlag(rtFftsPlusTaskInfo_t *fftsPlusTas
#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
}
#endif
#endif // CCE_RUNTIME_RT_FFTS_PLUS_H
#endif // CCE_RUNTIME_RT_FFTS_PLUS_H

+ 1
- 1
third_party/fwkacllib/inc/runtime/rt_model.h View File

@@ -131,7 +131,7 @@ typedef struct tagAllKernelTaskInfo {
uint16_t argsCount;
uint16_t argsSize;
uint16_t reserved;
const void *kernelInfoExt;
uint64_t tilingKey;
void *handle;
uint8_t *smDesc;
const uint8_t *args;


+ 11
- 0
third_party/fwkacllib/inc/runtime/stream.h View File

@@ -80,6 +80,17 @@ RTS_API rtError_t rtStreamDestroy(rtStream_t stm);
*/
RTS_API rtError_t rtStreamWaitEvent(rtStream_t stm, rtEvent_t evt);

/**
* @ingroup dvrt_stream
 * @brief wait a recorded event for stream, used for 1951 pg1
* @param [in] stm the wait stream
* @param [in] event the event to wait
* @param [in] timeout timeout value for 1951 pg1
* @return RT_ERROR_NONE for ok
* @return RT_ERROR_INVALID_VALUE for error input
*/
RTS_API rtError_t rtStreamWaitEventWithTimeout(rtStream_t stm, rtEvent_t evt, uint32_t timeout);

/**
* @ingroup dvrt_stream
* @brief wait stream to be complete


+ 2
- 0
third_party/fwkacllib/inc/toolchain/prof_acl_api.h View File

@@ -36,6 +36,7 @@
#define PROF_SUBTASK_TIME 0x0000040000000ULL
#define PROF_OP_DETAIL 0x0000080000000ULL

#define PROF_AICPU_MODEL 0x4000000000000000ULL
#define PROF_MODEL_LOAD 0x8000000000000000ULL

#define PROF_TASK_TRACE (PROF_MODEL_EXECUTE | PROF_RUNTIME_TRACE | PROF_TRAINING_TRACE | \
@@ -69,6 +70,7 @@
#define PROF_SUBTASK_TIME_MASK 0x0000040000000ULL
#define PROF_OP_DETAIL_MASK 0x0000080000000ULL

#define PROF_AICPU_MODEL_MASK 0x4000000000000000ULL
#define PROF_MODEL_LOAD_MASK 0x8000000000000000ULL

#if (defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER))


+ 63
- 25
third_party/fwkacllib/inc/toolchain/prof_callback.h View File

@@ -83,17 +83,6 @@ enum MsprofReporterCallbackType {
MSPROF_REPORTER_HASH // hash data to id
};

/**
* @name MsprofReporterCallback
 * @brief callback to start reporter/stop reporter/report data
* @param moduleId [IN] enum MsprofReporterModuleId
* @param type [IN] enum MsprofReporterCallbackType
 * @param data [IN] callback data (nullptr on INIT/UNINIT)
* @param len [IN] callback data size (0 on INIT/UNINIT)
* @return enum MsprofErrorCode
*/
typedef int32_t (*MsprofReporterCallback)(uint32_t moduleId, uint32_t type, void *data, uint32_t len);

#define MSPROF_OPTIONS_DEF_LEN_MAX (2048)

/**
@@ -114,6 +103,7 @@ enum MsprofCtrlCallbackType {
MSPROF_CTRL_INIT_ACL_JSON, // start pro with acl.json
MSPROF_CTRL_INIT_GE_OPTIONS, // start profiling with ge env and options
MSPROF_CTRL_FINALIZE, // stop profiling
MSPROF_CTRL_INIT_HELPER, // start profiling in helper device
MSPROF_CTRL_INIT_DYNA = 0xFF, // start profiling for dynamic profiling
};

@@ -127,40 +117,88 @@ enum MsprofCommandHandleType {
};

/**
* @name MsprofCtrlCallback
* @brief callback to start/stop profiling
* @param type [IN] enum MsprofCtrlCallbackType
* @param data [IN] callback data
* @param len [IN] callback data size
* @return enum MsprofErrorCode
* @brief profiling command type
*/
enum ProfCtrlType {
PROF_CTRL_INVALID = 0,
PROF_CTRL_SWITCH,
PROF_CTRL_REPORTER,
PROF_CTRL_STEPINFO,
PROF_CTRL_BUTT
};

/**
* @brief Prof Chip ID
*/
enum Prof_Chip_ID {
PROF_CHIP_ID0 = 0
};

typedef int32_t (*MsprofCtrlCallback)(uint32_t type, void *data, uint32_t len);
typedef int32_t (*MsprofReporterCallback)(uint32_t moduleId, uint32_t type, void *data, uint32_t len);

/**
* @name MsprofSetDeviceCallback
* @brief callback to notify set/reset device
* @param devId [IN] device id
* @param isOpenDevice [IN] true: set device, false: reset device
 * @brief the struct of profiling set step info
*/
typedef void (*MsprofSetDeviceCallback)(uint32_t devId, bool isOpenDevice);
typedef struct ProfStepInfoCmd {
uint64_t index_id;
uint16_t tag_id;
void *stream;
} ProfStepInfoCmd_t;

/**
* @name ProfCommandHandle
* @brief callback to start/stop profiling
* @param type [IN] enum call back type
* @param data [IN] callback data
* @param len [IN] callback data size
* @return enum MsprofErrorCode
*/
typedef int32_t (*ProfCommandHandle)(uint32_t type, void *data, uint32_t len);
/*
* @name MsprofInit
* @name profInit
* @brief Profiling module init
* @param [in] dataType: profiling type: ACL Env/ACL Json/GE Option
* @param [in] data: profiling switch data
* @param [in] dataLen: Length of data
* @return 0:SUCCESS, >0:FAILED
*/
MSVP_PROF_API int32_t MsprofInit(uint32_t dataType, void *data, uint32_t dataLen);
MSVP_PROF_API int32_t MsprofInit(uint32_t moduleId, void *data, uint32_t dataLen);
/**
* @name profRegisterCallback
* @brief register callback to profiling
* @param moduleId [IN] module Id
* @param handle [IN] the pointer of callback
*/
MSVP_PROF_API int32_t MsprofRegisterCallback(uint32_t moduleId, ProfCommandHandle handle);
/*
* @name profReportData
 * @brief start reporter/stop reporter/report data
* @param moduleId [IN] enum profReporterModuleId
* @param type [IN] enum profReporterCallbackType
 * @param data [IN] data (nullptr on INIT/UNINIT)
* @param len [IN] data size (0 on INIT/UNINIT)
 * @return enum MsprofErrorCode
*/
MSVP_PROF_API int32_t MsprofReportData(uint32_t moduleId, uint32_t type, void* data, uint32_t len);

MSVP_PROF_API int32_t MsprofSetDeviceIdByGeModelIdx(const uint32_t geModelIdx, const uint32_t deviceId);
MSVP_PROF_API int32_t MsprofUnsetDeviceIdByGeModelIdx(const uint32_t geModelIdx, const uint32_t deviceId);
/*
* @name AscendCL
* @name profFinalize
* @brief Finishing Profiling
* @param NULL
* @return 0:SUCCESS, >0:FAILED
*/
MSVP_PROF_API int32_t MsprofFinalize();
/**
* @name profNotifySetDevice
* @brief notify set/reset device
* @param devId [IN] device id
* @param isOpenDevice [IN] true: set device, false: reset device
*/
MSVP_PROF_API int32_t MsprofNotifySetDevice(uint32_t chipId, uint32_t deviceId, bool isOpen);

#ifdef __cplusplus
}
#endif


+ 27
- 1
third_party/fwkacllib/inc/toolchain/prof_common.h View File

@@ -28,6 +28,7 @@ enum MsprofDataTag {
MSPROF_RUNTIME_DATA_TAG_API = 40, //runtime data tag, range: 40~59
MSPROF_RUNTIME_DATA_TAG_TRACK = 41,
MSPROF_AICPU_DATA_TAG = 60, //aicpu data tag, range: 60~79
MSPROF_AICPU_MODEL_TAG = 61,
MSPROF_HCCL_DATA_TAG = 80, //hccl data tag, range: 80~99
MSPROF_DP_DATA_TAG = 100, //dp data tag, range: 100~119
MSPROF_MSPROFTX_DATA_TAG = 120, //hccl data tag, range: 120~139
@@ -52,6 +53,16 @@ struct MsprofMixData {
} data;
};

#define PATH_LEN_MAX 1023
#define PARAM_LEN_MAX 4095
struct MsprofCommandHandleParams {
uint32_t pathLen;
uint32_t storageLimit; // MB
uint32_t profDataLen;
char path[PATH_LEN_MAX + 1];
char profData[PARAM_LEN_MAX + 1];
};

/**
* @brief profiling command info
*/
@@ -63,6 +74,7 @@ struct MsprofCommandHandle {
uint32_t devIdList[MSPROF_MAX_DEV_NUM];
uint32_t modelId;
uint32_t type;
struct MsprofCommandHandleParams params;
};

/**
@@ -136,7 +148,7 @@ struct MsprofGeProfInferData {
uint8_t reserve[MSPROF_GE_INFER_DATA_RESERVE_BYTES];
};

#define MSPROF_GE_TASK_DATA_RESERVE_BYTES 16
#define MSPROF_GE_TASK_DATA_RESERVE_BYTES 12
#define MSPROF_GE_OP_TYPE_LEN 56
enum MsprofGeTaskType {
MSPROF_GE_TASK_TYPE_AI_CORE = 0,
@@ -169,6 +181,7 @@ struct MsprofGeProfTaskData {
uint32_t streamId;
uint32_t taskId;
uint32_t threadId;
uint32_t contextId;
uint8_t reserve[MSPROF_GE_TASK_DATA_RESERVE_BYTES];
};

@@ -305,6 +318,19 @@ struct MsprofAicpuProfData {
uint8_t reserve[MSPROF_AICPU_DATA_RESERVE_BYTES];
};

struct MsprofAicpuModelProfData {
uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
uint16_t dataTag = MSPROF_AICPU_MODEL_TAG;
uint32_t rsv; // Ensure 8-byte alignment
uint64_t timeStamp;
uint64_t indexId;
uint32_t modelId;
uint16_t tagId;
uint16_t rsv1;
uint64_t eventId;
uint8_t reserve[24];
};

/**
* @brief struct of data reported by DP
*/


+ 1
- 0
third_party/fwkacllib/inc/toolchain/slog.h View File

@@ -206,6 +206,7 @@ enum {
TUNE,
HSS, /**< helper */
FFTS,
OP,
INVLID_MOUDLE_ID
};



Loading…
Cancel
Save