
sync code 0728

Tag: v1.5.1
Author: yanghaoran, 3 years ago
Commit: 3dfd2119c1
100 changed files with 16205 additions and 845 deletions
  1. CMakeLists.txt (+1, -2)
  2. cmake/external_libs/json.cmake (+0, -4)
  3. ge/ge_runtime/CMakeLists.txt (+1, -0)
  4. ge/ge_runtime/task/hccl_task.cc (+2, -14)
  5. ge/ge_runtime/task/label_goto_task.cc (+41, -15)
  6. ge/ge_runtime/task/label_goto_task.h (+10, -6)
  7. ge/ge_runtime/task/label_manager.cc (+119, -0)
  8. ge/ge_runtime/task/label_manager.h (+54, -0)
  9. ge/ge_runtime/task/label_switch_task.cc (+11, -14)
  10. ge/ge_runtime/task/label_switch_task.h (+4, -2)
  11. inc/external/acl/acl.h (+82, -0)
  12. inc/external/acl/acl_base.h (+638, -0)
  13. inc/external/acl/acl_mdl.h (+1225, -0)
  14. inc/external/acl/acl_op.h (+504, -0)
  15. inc/external/acl/acl_op_compiler.h (+121, -0)
  16. inc/external/acl/acl_prof.h (+329, -0)
  17. inc/external/acl/acl_rt.h (+958, -0)
  18. inc/external/acl/acl_tdt.h (+276, -0)
  19. inc/external/acl/error_codes/ge_error_codes.h (+75, -0)
  20. inc/external/acl/error_codes/rt_error_codes.h (+109, -0)
  21. inc/external/acl/ops/acl_cblas.h (+334, -0)
  22. inc/external/acl/ops/acl_dvpp.h (+2568, -0)
  23. inc/external/acl/ops/acl_fv.h (+348, -0)
  24. inc/external/hccl/hccl.h (+159, -0)
  25. inc/external/hccl/hccl_types.h (+101, -0)
  26. inc/external/runtime/rt_error_codes.h (+109, -0)
  27. inc/framework/ge_runtime/task_info.h (+4, -1)
  28. metadef (+1, -1)
  29. scripts/format_source_code.sh (+107, -0)
  30. third_party/fwkacllib/inc/cce/taskdown_common.hpp (+10, -9)
  31. third_party/fwkacllib/inc/external/runtime/rt_error_codes.h (+0, -0)
  32. third_party/fwkacllib/inc/hccl/base.h (+18, -18)
  33. third_party/fwkacllib/inc/hccl/hccl_types.h (+0, -101)
  34. third_party/fwkacllib/inc/hccl/hcom.h (+14, -0)
  35. third_party/fwkacllib/inc/mmpa/mmpa_api.h (+1, -0)
  36. third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h (+4, -0)
  37. third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h (+4, -0)
  38. third_party/fwkacllib/inc/ops/aipp.h (+3, -1)
  39. third_party/fwkacllib/inc/ops/all_ops.h (+2, -1)
  40. third_party/fwkacllib/inc/ops/array_ops.h (+100, -4)
  41. third_party/fwkacllib/inc/ops/audio_ops.h (+1, -1)
  42. third_party/fwkacllib/inc/ops/avg_pool_1d_ops.h (+58, -0)
  43. third_party/fwkacllib/inc/ops/batch_ops.h (+13, -8)
  44. third_party/fwkacllib/inc/ops/bitwise_ops.h (+30, -1)
  45. third_party/fwkacllib/inc/ops/boosted_trees_ops.h (+1, -1)
  46. third_party/fwkacllib/inc/ops/candidate_sampling_ops.h (+1, -1)
  47. third_party/fwkacllib/inc/ops/condtake_ops.h (+1, -1)
  48. third_party/fwkacllib/inc/ops/control_flow_ops.h (+6, -6)
  49. third_party/fwkacllib/inc/ops/correlation.h (+52, -0)
  50. third_party/fwkacllib/inc/ops/ctc_ops.h (+82, -1)
  51. third_party/fwkacllib/inc/ops/data_flow_ops.h (+82, -7)
  52. third_party/fwkacllib/inc/ops/elewise_calculation_ops.h (+511, -16)
  53. third_party/fwkacllib/inc/ops/functional_ops.h (+1, -1)
  54. third_party/fwkacllib/inc/ops/get_data_ops.h (+1, -1)
  55. third_party/fwkacllib/inc/ops/globalavgpool.h (+49, -0)
  56. third_party/fwkacllib/inc/ops/hcom_ops.h (+121, -14)
  57. third_party/fwkacllib/inc/ops/hvd_ops.h (+1, -1)
  58. third_party/fwkacllib/inc/ops/image_ops.h (+623, -30)
  59. third_party/fwkacllib/inc/ops/internal_ops.h (+1, -1)
  60. third_party/fwkacllib/inc/ops/linalg_ops.h (+93, -45)
  61. third_party/fwkacllib/inc/ops/list_ops.h (+504, -0)
  62. third_party/fwkacllib/inc/ops/logging_ops.h (+1, -1)
  63. third_party/fwkacllib/inc/ops/lookup_ops.h (+1, -1)
  64. third_party/fwkacllib/inc/ops/math_ops.h (+276, -7)
  65. third_party/fwkacllib/inc/ops/matrix_calculation_ops.h (+285, -51)
  66. third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h (+105, -29)
  67. third_party/fwkacllib/inc/ops/nn_calculation_ops.h (+363, -85)
  68. third_party/fwkacllib/inc/ops/nn_detect_ops.h (+462, -140)
  69. third_party/fwkacllib/inc/ops/nn_norm_ops.h (+755, -22)
  70. third_party/fwkacllib/inc/ops/nn_ops.h (+139, -2)
  71. third_party/fwkacllib/inc/ops/nn_pooling_ops.h (+470, -18)
  72. third_party/fwkacllib/inc/ops/nn_training_ops.h (+50, -1)
  73. third_party/fwkacllib/inc/ops/no_op.h (+1, -1)
  74. third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h (+406, -2)
  75. third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h (+1, -1)
  76. third_party/fwkacllib/inc/ops/outfeed_ops.h (+1, -1)
  77. third_party/fwkacllib/inc/ops/pad_ops.h (+114, -46)
  78. third_party/fwkacllib/inc/ops/parsing_ops.h (+241, -1)
  79. third_party/fwkacllib/inc/ops/quantize_ops.h (+25, -6)
  80. third_party/fwkacllib/inc/ops/ragged_array_ops.h (+1, -1)
  81. third_party/fwkacllib/inc/ops/ragged_conversion_ops.h (+1, -1)
  82. third_party/fwkacllib/inc/ops/ragged_math_ops.h (+1, -1)
  83. third_party/fwkacllib/inc/ops/random_ops.h (+90, -1)
  84. third_party/fwkacllib/inc/ops/reduce_ops.h (+261, -18)
  85. third_party/fwkacllib/inc/ops/resource_variable_ops.h (+1, -1)
  86. third_party/fwkacllib/inc/ops/rnn.h (+566, -29)
  87. third_party/fwkacllib/inc/ops/rpn_ops.h (+1, -1)
  88. third_party/fwkacllib/inc/ops/save_ops.h (+1, -1)
  89. third_party/fwkacllib/inc/ops/sdca_ops.h (+1, -1)
  90. third_party/fwkacllib/inc/ops/selection_ops.h (+406, -20)
  91. third_party/fwkacllib/inc/ops/set_ops.h (+1, -1)
  92. third_party/fwkacllib/inc/ops/sparse_ops.h (+4, -4)
  93. third_party/fwkacllib/inc/ops/spectral_ops.h (+97, -1)
  94. third_party/fwkacllib/inc/ops/split_combination_ops.h (+13, -13)
  95. third_party/fwkacllib/inc/ops/state_ops.h (+1, -1)
  96. third_party/fwkacllib/inc/ops/stateful_random_ops.h (+1, -1)
  97. third_party/fwkacllib/inc/ops/stateless_random_ops.h (+1, -1)
  98. third_party/fwkacllib/inc/ops/string_ops.h (+380, -2)
  99. third_party/fwkacllib/inc/ops/swap_co_ops.h (+1, -1)
  100. third_party/fwkacllib/inc/ops/target_crop_and_resize.h (+1, -1)

CMakeLists.txt (+1, -2)

@@ -125,7 +125,6 @@ else ()
message(STATUS "PLATFORM param is invalid, should be train or inference, you choose nothing!")
endif()
endif()

set(METADEF_DIR ${CMAKE_CURRENT_LIST_DIR}/metadef)
set(PARSER_DIR ${CMAKE_CURRENT_LIST_DIR}/parser)
set(GE_DEPEND_DIR ${CMAKE_CURRENT_LIST_DIR}/..)
@@ -158,6 +157,7 @@ else ()
elseif(ENABLE_MS_TESTCASES)
include(cmake/external_libs/protobuf_static.cmake)
include(cmake/external_libs/protoc.cmake)
include(cmake/external_libs/json.cmake)
include(cmake/external_libs/securec.cmake)
include(cmake/FindModule.cmake)
include(cmake/intf_pub_linux.cmake)
@@ -175,5 +175,4 @@ else ()
endif()

add_subdirectory(ge)

endif ()

cmake/external_libs/json.cmake (+0, -4)

@@ -9,10 +9,6 @@ if (GE_PB_PKG)
set(REQ_URL "${GE_PB_PKG}/libs/ge_nlohmann_json/include.zip")
set(MD5 "0dc903888211db3a0f170304cd9f3a89")
set(JSON_INCLUDE_DIR ${JSON_SRC_DIR})
#elseif (ENABLE_GITEE)
# set(REQ_URL "https://gitee.com/mirrors/JSON-for-Modern-CPP/repository/archive/v3.6.1.zip")
# set(MD5 "5bda78ce308e6cfcf614dcf1d5ff27a7")
#set(JSON_INCLUDE_DIR "${JSON_SRC_DIR}/include")
else()
set(REQ_URL "https://github.com/nlohmann/json/releases/download/v3.6.1/include.zip")
set(MD5 "0dc903888211db3a0f170304cd9f3a89")


ge/ge_runtime/CMakeLists.txt (+1, -0)

@@ -16,6 +16,7 @@ set(GE_SRC_LIST
"task/label_goto_task.cc"
"task/label_set_task.cc"
"task/label_switch_task.cc"
"task/label_manager.cc"
)

add_library(ge_runtime SHARED ${GE_SRC_LIST})


ge/ge_runtime/task/hccl_task.cc (+2, -14)

@@ -53,15 +53,7 @@ HcclTask::HcclTask(const ModelContext &model_context, const std::shared_ptr<Hccl
}
}

HcclTask::~HcclTask() {
if (workspace_mem_ != nullptr) {
rtError_t rt_ret = rtFree(workspace_mem_);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtFree workspace_mem_ failed! ret: 0x%X.", rt_ret);
}
workspace_mem_ = nullptr;
}
}
HcclTask::~HcclTask() {}

bool HcclTask::Distribute() {
// Ops kernel info store
@@ -80,11 +72,7 @@ bool HcclTask::Distribute() {
SetSecondaryStream();

if (task_info_->workspace_size() > 0) {
rtError_t rt_ret = rtMalloc(&workspace_mem_, task_info_->workspace_size(), RT_MEMORYINFO_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return false;
}
workspace_mem_ = task_info_->workspace_addr();
}

GELOGI("HcclTaskInfo Distribute Start. begin to call function LoadTask in hccl.");


ge/ge_runtime/task/label_goto_task.cc (+41, -15)

@@ -16,33 +16,46 @@

#include "ge_runtime/task/label_goto_task.h"
#include "ge_runtime/task/task_factory.h"
#include "framework/common/util.h"

namespace ge {
namespace model_runner {
LabelGotoTask::LabelGotoTask(const ModelContext &model_context, const std::shared_ptr<LabelGotoTaskInfo> &task_info)
: TaskRepeater<LabelGotoTaskInfo>(model_context, task_info), task_info_(task_info) {
: TaskRepeater<LabelGotoTaskInfo>(model_context, task_info),
task_info_(task_info),
stream_(nullptr),
index_value_(nullptr) {
if (task_info_ == nullptr) {
GELOGW("task_info_ is null!");
return;
}
auto stream_list = model_context.stream_list();
auto label_list = model_context.label_list();
rt_model_handle_ = model_context.rt_model_handle();
uint32_t stream_id = task_info->stream_id();
uint32_t label_id = task_info->label_id();
label_id_ = task_info->label_id();
GELOGI("Stream list size:%zu, stream id:%u.", stream_list.size(), stream_id);
GELOGI("Label list size:%zu, label id:%u.", label_list.size(), label_id);
if (stream_id >= stream_list.size() || label_id >= label_list.size()) {
GELOGI("Label list size:%zu, label id:%u.", label_list.size(), label_id_);
if (stream_id >= stream_list.size() || label_id_ >= label_list.size()) {
GELOGW("Stream/Label id invalid.");
return;
}
stream_ = stream_list[stream_id];
label_ = label_list[label_id];
label_manager_ = LabelManager::GetInstance();
if (label_manager_ == nullptr) {
GELOGW("Get label manager instance failed.");
return;
}
label_info_ = label_manager_->GetLabelInfo(rt_model_handle_, {label_id_}, label_list);
}

LabelGotoTask::~LabelGotoTask() {
GE_FREE_RT_LOG(label_info_);
GE_FREE_RT_LOG(index_value_);
if (index_value_ != nullptr) {
rtError_t rt_ret = rtFree(index_value_);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtFree index_value_ failed! ret: 0x%X.", rt_ret);
}
index_value_ = nullptr;
}
}

bool LabelGotoTask::Distribute() {
@@ -94,21 +107,34 @@ bool LabelGotoTask::CheckParamValid() {
return false;
}

if (label_ == nullptr) {
GELOGE(PARAM_INVALID, "label is null!");
if (label_info_ == nullptr) {
GELOGE(PARAM_INVALID, "label info is null!");
return false;
}

if (label_info_ != nullptr) {
GELOGE(PARAM_INVALID, "label_info_ has dirty data.");
return false;
if (index_value_ == nullptr) {
rtError_t rt_ret = rtMalloc(&index_value_, sizeof(uint64_t), RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return false;
}

uint64_t index = 0;
rt_ret = rtMemcpy(index_value_, sizeof(uint64_t), &index, sizeof(index), RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return false;
}
}

if (index_value_ != nullptr) {
GELOGE(PARAM_INVALID, "index_value_ has dirty data.");
void *label_info = label_info_->GetLabelInfo();
rtError_t rt_ret = rtLabelSwitchByIndex(index_value_, 1, label_info, stream_);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return false;
}

GELOGI("DistributeTask end.");
return true;
}
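
In other words, a label goto is now realized as a degenerate label switch: the device-side index is pinned to 0, so the one-entry switch table always takes branch 0. A condensed sketch of the new Distribute() flow, using only calls visible in this diff (the wrapper function and its parameter names are illustrative):

bool GotoLabel(void *index_value, const std::shared_ptr<LabelGuard> &label_guard, void *stream) {
  // index_value: device-side uint64 obtained via rtMalloc, as in CheckParamValid() above
  uint64_t index = 0;  // always branch 0 -> unconditional goto
  if (rtMemcpy(index_value, sizeof(uint64_t), &index, sizeof(index), RT_MEMCPY_HOST_TO_DEVICE) != RT_ERROR_NONE) {
    return false;
  }
  void *label_info = label_guard->GetLabelInfo();  // device table shared through LabelManager
  return rtLabelSwitchByIndex(index_value, 1, label_info, stream) == RT_ERROR_NONE;
}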



ge/ge_runtime/task/label_goto_task.h (+10, -6)

@@ -18,7 +18,11 @@
#define GE_GE_RUNTIME_TASK_LABEL_GOTO_TASK_H_

#include <memory>
#include <vector>
#include <map>
#include <mutex>
#include "ge_runtime/task/task.h"
#include "ge_runtime/task/label_manager.h"

namespace ge {
namespace model_runner {
@@ -31,13 +35,13 @@ class LabelGotoTask : public TaskRepeater<LabelGotoTaskInfo> {
bool Distribute() override;

private:
bool CheckParamValid();

std::shared_ptr<LabelGotoTaskInfo> task_info_;
void *stream_{nullptr};
void *label_{nullptr};
void *label_info_{nullptr};
void *index_value_{nullptr};
void *stream_;
std::shared_ptr<LabelGuard> label_info_;
void *index_value_;
uint32_t label_id_;
rtModel_t rt_model_handle_;
std::shared_ptr<LabelManager> label_manager_;
};
} // namespace model_runner
} // namespace ge


ge/ge_runtime/task/label_manager.cc (+119, -0)

@@ -0,0 +1,119 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ge_runtime/task/label_manager.h"
#include <algorithm>
#include <string>
#include "runtime/mem.h"
#include "runtime/rt_model.h"
#include "common/ge_inner_error_codes.h"
#include "framework/common/debug/ge_log.h"

namespace ge {
namespace model_runner {
std::weak_ptr<LabelManager> LabelManager::instance_;
std::mutex LabelManager::instance_mutex_;

template <class T>
static std::string GetVectorString(const std::vector<T> &vec) {
std::string ret;
for (size_t i = 0; i < vec.size(); ++i) {
if (i != 0) {
ret.push_back(',');
}
ret += std::to_string(vec[i]);
}
return ret;
}

LabelGuard::~LabelGuard() {
void *label_info = GetLabelInfo();
if (label_info != nullptr) {
rtError_t rt_ret = rtFree(label_info);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtFree label_info failed! ret: 0x%X.", rt_ret);
}
}
}

std::shared_ptr<LabelManager> LabelManager::GetInstance() {
std::lock_guard<std::mutex> lock(instance_mutex_);
auto instance = instance_.lock();
if (instance != nullptr) {
return instance;
}

instance = std::make_shared<LabelManager>();
instance_ = instance;
return instance;
}

std::shared_ptr<LabelGuard> LabelManager::GetLabelInfo(rtModel_t model, const std::vector<uint32_t> &label_ids,
const std::vector<void *> &all_label) {
std::lock_guard<std::mutex> lock(model_info_mapping_mutex_);
rtError_t rt_ret;
auto model_iter = model_info_mapping_.find(model);
if (model_iter == model_info_mapping_.end()) {
model_info_mapping_.emplace(model, std::map<std::string, std::weak_ptr<LabelGuard>>());
model_iter = model_info_mapping_.find(model);
}

std::string label_id_str = GetVectorString(label_ids);
auto &label_map = model_iter->second;
auto label_iter = label_map.find(label_id_str);
if (label_iter != label_map.end()) {
auto label_guard = label_iter->second.lock();
if (label_guard != nullptr) {
GELOGI("model %p find same label id %s.", model, label_id_str.c_str());
return label_guard;
}
}

GELOGI("Alloc label id %s for model %p.", label_id_str.c_str(), model);
void *label_info;
std::vector<void *> label_list;
bool status = true;
std::transform(label_ids.begin(), label_ids.end(), std::back_inserter(label_list),
[&all_label, &status](uint32_t idx) -> void * {
if (idx >= all_label.size()) {
GELOGE(PARAM_INVALID, "Invalid label id %u, all label list size %zu.", idx, all_label.size());
status = false;
return nullptr;
}
return all_label[idx];
});
if (!status) {
GELOGE(PARAM_INVALID, "Get label info failed.");
return nullptr;
}
uint32_t label_info_size = sizeof(rtLabelDevInfo) * label_list.size();
rt_ret = rtMalloc(&label_info, label_info_size, RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return nullptr;
}

rt_ret = rtLabelListCpy(label_list.data(), label_list.size(), label_info, label_info_size);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return nullptr;
}

auto label_guard = std::make_shared<LabelGuard>(label_info);
label_map.emplace(label_id_str, label_guard);
return label_guard;
}
} // namespace model_runner
} // namespace ge
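
The caching scheme in GetLabelInfo() is the classic weak_ptr cache: the per-model map stores non-owning references keyed by the label-id string, so one rtMalloc'd label table is shared among tasks while any of them still holds the LabelGuard, and ~LabelGuard frees it once the last holder drops it. A self-contained C++ sketch of the pattern, with illustrative names:

#include <map>
#include <memory>
#include <mutex>
#include <string>

struct Guard {  // stands in for LabelGuard
  explicit Guard(int v) : value(v) {}
  int value;
};

static std::mutex cache_mutex;
static std::map<std::string, std::weak_ptr<Guard>> cache;  // non-owning entries

std::shared_ptr<Guard> GetOrCreate(const std::string &key, int v) {
  std::lock_guard<std::mutex> lock(cache_mutex);
  auto it = cache.find(key);
  if (it != cache.end()) {
    if (auto hit = it->second.lock()) {  // entry still alive: reuse it
      return hit;
    }
  }
  auto guard = std::make_shared<Guard>(v);  // absent or expired: rebuild
  cache[key] = guard;  // the cache never extends the object's lifetime
  return guard;
}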

ge/ge_runtime/task/label_manager.h (+54, -0)

@@ -0,0 +1,54 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GE_GE_RUNTIME_TASK_LABEL_MANAGER_H_
#define GE_GE_RUNTIME_TASK_LABEL_MANAGER_H_

#include <vector>
#include <memory>
#include <mutex>
#include <map>
#include <runtime/base.h>

namespace ge {
namespace model_runner {
class LabelGuard {
public:
explicit LabelGuard(void *label_info) : label_info_(reinterpret_cast<uintptr_t>(label_info)) {}
~LabelGuard();
void *GetLabelInfo() { return reinterpret_cast<void *>(label_info_); }

private:
uintptr_t label_info_;
};

class LabelManager {
public:
static std::shared_ptr<LabelManager> GetInstance();
std::shared_ptr<LabelGuard> GetLabelInfo(rtModel_t model, const std::vector<uint32_t> &label_ids,
const std::vector<void *> &all_label);

private:
std::mutex model_info_mapping_mutex_;
std::map<rtModel_t, std::map<std::string, std::weak_ptr<LabelGuard>>> model_info_mapping_;

static std::weak_ptr<LabelManager> instance_;
static std::mutex instance_mutex_;
};


} // namespace model_runner
} // namespace ge
#endif // GE_GE_RUNTIME_TASK_LABEL_MANAGER_H_

ge/ge_runtime/task/label_switch_task.cc (+11, -14)

@@ -24,14 +24,14 @@ LabelSwitchTask::LabelSwitchTask(const ModelContext &model_context,
: TaskRepeater<LabelSwitchTaskInfo>(model_context, task_info),
task_info_(task_info),
stream_(nullptr),
all_label_resource_(),
label_info_(nullptr) {
if (task_info_ == nullptr) {
GELOGW("task_info_ is null!");
return;
}

all_label_resource_ = model_context.label_list();
rt_model_handle_ = model_context.rt_model_handle();
auto all_label_resource = model_context.label_list();
auto stream_list = model_context.stream_list();
uint32_t stream_id = task_info->stream_id();
GELOGI("Stream list size:%zu, stream id:%u.", stream_list.size(), stream_id);
@@ -40,18 +40,16 @@ LabelSwitchTask::LabelSwitchTask(const ModelContext &model_context,
return;
}
stream_ = stream_list[stream_id];
}

LabelSwitchTask::~LabelSwitchTask() {
if (label_info_ != nullptr) {
rtError_t rt_ret = rtFree(label_info_);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtFree fwkOpBuf failed! ret: 0x%X.", rt_ret);
}
label_info_ = nullptr;
label_manager_ = LabelManager::GetInstance();
if (label_manager_ == nullptr) {
GELOGW("Get label manager instance failed.");
return;
}
label_info_ = label_manager_->GetLabelInfo(rt_model_handle_, task_info_->label_list(), all_label_resource);
}

LabelSwitchTask::~LabelSwitchTask() {}

bool LabelSwitchTask::Distribute() {
GELOGI("LabelSwitchTask Distribute start.");
if (!CheckParamValid()) {
@@ -117,8 +115,8 @@ bool LabelSwitchTask::CheckParamValid() {
return false;
}

if (label_info_ != nullptr) {
GELOGE(PARAM_INVALID, "label_info_ has dirty data.");
if (label_info_ == nullptr) {
GELOGE(PARAM_INVALID, "CopyLabelList failed, label info is null.");
return false;
}

@@ -126,6 +124,5 @@ bool LabelSwitchTask::CheckParamValid() {
}

REGISTER_TASK(TaskInfoType::LABEL_SWITCH, LabelSwitchTask, LabelSwitchTaskInfo);

} // namespace model_runner
} // namespace ge

ge/ge_runtime/task/label_switch_task.h (+4, -2)

@@ -19,6 +19,7 @@

#include <memory>
#include "ge_runtime/task/task.h"
#include "ge_runtime/task/label_manager.h"

namespace ge {
namespace model_runner {
@@ -35,8 +36,9 @@ class LabelSwitchTask : public TaskRepeater<LabelSwitchTaskInfo> {

std::shared_ptr<LabelSwitchTaskInfo> task_info_;
void *stream_;
std::vector<void *> all_label_resource_;
void *label_info_;
rtModel_t rt_model_handle_;
std::shared_ptr<LabelGuard> label_info_;
std::shared_ptr<LabelManager> label_manager_;
};
} // namespace model_runner
} // namespace ge


inc/external/acl/acl.h (+82, -0)

@@ -0,0 +1,82 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_ACL_ACL_H_
#define INC_EXTERNAL_ACL_ACL_H_

#include "acl_rt.h"
#include "acl_op.h"
#include "acl_mdl.h"

#ifdef __cplusplus
extern "C" {
#endif

// Current version is 1.0.0
#define ACL_MAJOR_VERSION 1
#define ACL_MINOR_VERSION 0
#define ACL_PATCH_VERSION 0

/**
* @ingroup AscendCL
* @brief acl initialize
*
* @par Restriction
* The aclInit interface can be called only once in a process
* @param configPath [IN] the config path, it can be NULL
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclInit(const char *configPath);

/**
* @ingroup AscendCL
* @brief acl finalize
*
* @par Restriction
* Need to call aclFinalize before the process exits.
* After calling aclFinalize, the services cannot continue to be used normally.
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclFinalize();

/**
* @ingroup AscendCL
* @brief query ACL interface version
*
* @param majorVersion[OUT] ACL interface major version
* @param minorVersion[OUT] ACL interface minor version
* @param patchVersion[OUT] ACL interface patch version
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtGetVersion(int32_t *majorVersion, int32_t *minorVersion, int32_t *patchVersion);

/**
* @ingroup AscendCL
* @brief get recent error message
*
* @retval null for failed
* @retval OtherValues success
*/
ACL_FUNC_VISIBILITY const char *aclGetRecentErrMsg();

#ifdef __cplusplus
}
#endif

#endif // INC_EXTERNAL_ACL_ACL_H_
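
The three functions above define the whole process lifecycle. A minimal sketch of the intended call order (the include path is an assumption; everything else is declared in this header or in acl_base.h, which it pulls in transitively):

#include "acl/acl.h"

int main() {
  if (aclInit(NULL) != ACL_SUCCESS) {  // once per process; a NULL config path is allowed
    return 1;
  }
  int32_t major = 0, minor = 0, patch = 0;
  (void)aclrtGetVersion(&major, &minor, &patch);
  return (aclFinalize() == ACL_SUCCESS) ? 0 : 1;  // must run before the process exits
}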

inc/external/acl/acl_base.h (+638, -0)

@@ -0,0 +1,638 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_ACL_ACL_BASE_H_
#define INC_EXTERNAL_ACL_ACL_BASE_H_

#include <stdint.h>
#include <stddef.h>
#include "error_codes/rt_error_codes.h"
#include "error_codes/ge_error_codes.h"

#ifdef __cplusplus
extern "C" {
#endif

#if defined(_MSC_VER)
#ifdef FUNC_VISIBILITY
#define ACL_FUNC_VISIBILITY _declspec(dllexport)
#else
#define ACL_FUNC_VISIBILITY
#endif
#else
#ifdef FUNC_VISIBILITY
#define ACL_FUNC_VISIBILITY __attribute__((visibility("default")))
#else
#define ACL_FUNC_VISIBILITY
#endif
#endif

#ifdef __GNUC__
#define ACL_DEPRECATED __attribute__((deprecated))
#define ACL_DEPRECATED_MESSAGE(message) __attribute__((deprecated(message)))
#elif defined(_MSC_VER)
#define ACL_DEPRECATED __declspec(deprecated)
#define ACL_DEPRECATED_MESSAGE(message) __declspec(deprecated(message))
#else
#define ACL_DEPRECATED
#define ACL_DEPRECATED_MESSAGE(message)
#endif

typedef void *aclrtStream;
typedef void *aclrtEvent;
typedef void *aclrtContext;
typedef int aclError;
typedef uint16_t aclFloat16;
typedef struct aclDataBuffer aclDataBuffer;
typedef struct aclTensorDesc aclTensorDesc;

static const int ACL_ERROR_NONE = 0;
static const int ACL_SUCCESS = 0;

static const int ACL_ERROR_INVALID_PARAM = 100000;
static const int ACL_ERROR_UNINITIALIZE = 100001;
static const int ACL_ERROR_REPEAT_INITIALIZE = 100002;
static const int ACL_ERROR_INVALID_FILE = 100003;
static const int ACL_ERROR_WRITE_FILE = 100004;
static const int ACL_ERROR_INVALID_FILE_SIZE = 100005;
static const int ACL_ERROR_PARSE_FILE = 100006;
static const int ACL_ERROR_FILE_MISSING_ATTR = 100007;
static const int ACL_ERROR_FILE_ATTR_INVALID = 100008;
static const int ACL_ERROR_INVALID_DUMP_CONFIG = 100009;
static const int ACL_ERROR_INVALID_PROFILING_CONFIG = 100010;
static const int ACL_ERROR_INVALID_MODEL_ID = 100011;
static const int ACL_ERROR_DESERIALIZE_MODEL = 100012;
static const int ACL_ERROR_PARSE_MODEL = 100013;
static const int ACL_ERROR_READ_MODEL_FAILURE = 100014;
static const int ACL_ERROR_MODEL_SIZE_INVALID = 100015;
static const int ACL_ERROR_MODEL_MISSING_ATTR = 100016;
static const int ACL_ERROR_MODEL_INPUT_NOT_MATCH = 100017;
static const int ACL_ERROR_MODEL_OUTPUT_NOT_MATCH = 100018;
static const int ACL_ERROR_MODEL_NOT_DYNAMIC = 100019;
static const int ACL_ERROR_OP_TYPE_NOT_MATCH = 100020;
static const int ACL_ERROR_OP_INPUT_NOT_MATCH = 100021;
static const int ACL_ERROR_OP_OUTPUT_NOT_MATCH = 100022;
static const int ACL_ERROR_OP_ATTR_NOT_MATCH = 100023;
static const int ACL_ERROR_OP_NOT_FOUND = 100024;
static const int ACL_ERROR_OP_LOAD_FAILED = 100025;
static const int ACL_ERROR_UNSUPPORTED_DATA_TYPE = 100026;
static const int ACL_ERROR_FORMAT_NOT_MATCH = 100027;
static const int ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED = 100028;
static const int ACL_ERROR_KERNEL_NOT_FOUND = 100029;
static const int ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED = 100030;
static const int ACL_ERROR_KERNEL_ALREADY_REGISTERED = 100031;
static const int ACL_ERROR_INVALID_QUEUE_ID = 100032;
static const int ACL_ERROR_REPEAT_SUBSCRIBE = 100033;
static const int ACL_ERROR_STREAM_NOT_SUBSCRIBE = 100034;
static const int ACL_ERROR_THREAD_NOT_SUBSCRIBE = 100035;
static const int ACL_ERROR_WAIT_CALLBACK_TIMEOUT = 100036;
static const int ACL_ERROR_REPEAT_FINALIZE = 100037;
static const int ACL_ERROR_NOT_STATIC_AIPP = 100038;
static const int ACL_ERROR_COMPILING_STUB_MODE = 100039;
static const int ACL_ERROR_GROUP_NOT_SET = 100040;
static const int ACL_ERROR_GROUP_NOT_CREATE = 100041;
static const int ACL_ERROR_PROF_ALREADY_RUN = 100042;
static const int ACL_ERROR_PROF_NOT_RUN = 100043;
static const int ACL_ERROR_DUMP_ALREADY_RUN = 100044;
static const int ACL_ERROR_DUMP_NOT_RUN = 100045;
static const int ACL_ERROR_PROF_REPEAT_SUBSCRIBE = 148046;
static const int ACL_ERROR_PROF_API_CONFLICT = 148047;
static const int ACL_ERROR_INVALID_MAX_OPQUEUE_NUM_CONFIG = 148048;
static const int ACL_ERROR_INVALID_OPP_PATH = 148049;
static const int ACL_ERROR_OP_UNSUPPORTED_DYNAMIC = 148050;

static const int ACL_ERROR_BAD_ALLOC = 200000;
static const int ACL_ERROR_API_NOT_SUPPORT = 200001;
static const int ACL_ERROR_INVALID_DEVICE = 200002;
static const int ACL_ERROR_MEMORY_ADDRESS_UNALIGNED = 200003;
static const int ACL_ERROR_RESOURCE_NOT_MATCH = 200004;
static const int ACL_ERROR_INVALID_RESOURCE_HANDLE = 200005;
static const int ACL_ERROR_FEATURE_UNSUPPORTED = 200006;
static const int ACL_ERROR_PROF_MODULES_UNSUPPORTED = 200007;

static const int ACL_ERROR_STORAGE_OVER_LIMIT = 300000;

static const int ACL_ERROR_INTERNAL_ERROR = 500000;
static const int ACL_ERROR_FAILURE = 500001;
static const int ACL_ERROR_GE_FAILURE = 500002;
static const int ACL_ERROR_RT_FAILURE = 500003;
static const int ACL_ERROR_DRV_FAILURE = 500004;
static const int ACL_ERROR_PROFILING_FAILURE = 500005;

#define ACL_TENSOR_SHAPE_RANGE_NUM 2
#define ACL_UNKNOWN_RANK 0xFFFFFFFFFFFFFFFE

typedef enum {
ACL_DT_UNDEFINED = -1,
ACL_FLOAT = 0,
ACL_FLOAT16 = 1,
ACL_INT8 = 2,
ACL_INT32 = 3,
ACL_UINT8 = 4,
ACL_INT16 = 6,
ACL_UINT16 = 7,
ACL_UINT32 = 8,
ACL_INT64 = 9,
ACL_UINT64 = 10,
ACL_DOUBLE = 11,
ACL_BOOL = 12,
ACL_STRING = 13,
} aclDataType;

typedef enum {
ACL_FORMAT_UNDEFINED = -1,
ACL_FORMAT_NCHW = 0,
ACL_FORMAT_NHWC = 1,
ACL_FORMAT_ND = 2,
ACL_FORMAT_NC1HWC0 = 3,
ACL_FORMAT_FRACTAL_Z = 4,
ACL_FORMAT_NC1HWC0_C04 = 12,
ACL_FORMAT_NDHWC = 27,
ACL_FORMAT_FRACTAL_NZ = 29,
ACL_FORMAT_NCDHW = 30,
ACL_FORMAT_NDC1HWC0 = 32,
ACL_FRACTAL_Z_3D = 33
} aclFormat;

typedef enum {
ACL_DEBUG = 0,
ACL_INFO = 1,
ACL_WARNING = 2,
ACL_ERROR = 3,
} aclLogLevel;

typedef enum {
ACL_MEMTYPE_DEVICE = 0,
ACL_MEMTYPE_HOST = 1,
} aclMemType;

/**
* @ingroup AscendCL
* @brief Converts data of type aclFloat16 to data of type float
*
* @param value [IN] Data to be converted
*
* @retval Transformed data
*/
ACL_FUNC_VISIBILITY float aclFloat16ToFloat(aclFloat16 value);

/**
* @ingroup AscendCL
* @brief Converts data of type float to data of type aclFloat16
*
* @param value [IN] Data to be converted
*
* @retval Transformed data
*/
ACL_FUNC_VISIBILITY aclFloat16 aclFloatToFloat16(float value);

/**
* @ingroup AscendCL
* @brief create data of aclDataBuffer
*
* @param data [IN] pointer to data
* @li Need to be managed by the user,
* call aclrtMalloc interface to apply for memory,
* call aclrtFree interface to release memory
*
* @param size [IN] size of data in bytes
*
* @retval pointer to created instance. nullptr if run out of memory
*
* @see aclrtMalloc | aclrtFree
*/
ACL_FUNC_VISIBILITY aclDataBuffer *aclCreateDataBuffer(void *data, size_t size);

/**
* @ingroup AscendCL
* @brief destroy data of aclDataBuffer
*
* @par Function
* Only the aclDataBuffer type data is destroyed here.
* The memory of the data passed in when the aclCreateDataBuffer interface
* is called to create aclDataBuffer type data must be released by the user
*
* @param dataBuffer [IN] pointer to the aclDataBuffer
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclCreateDataBuffer
*/
ACL_FUNC_VISIBILITY aclError aclDestroyDataBuffer(const aclDataBuffer *dataBuffer);

/**
* @ingroup AscendCL
* @brief update new data of aclDataBuffer
*
* @param dataBuffer [OUT] pointer to aclDataBuffer
* @li The old data needs to be released by the user, otherwise a memory leak may occur;
* call aclGetDataBufferAddr interface to get old data address
* call aclrtFree interface to release memory
*
* @param data [IN] pointer to new data
* @li Need to be managed by the user,
* call aclrtMalloc interface to apply for memory,
* call aclrtFree interface to release memory
*
* @param size [IN] size of data in bytes
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtMalloc | aclrtFree | aclGetDataBufferAddr
*/
ACL_FUNC_VISIBILITY aclError aclUpdateDataBuffer(aclDataBuffer *dataBuffer, void *data, size_t size);

/**
* @ingroup AscendCL
* @brief get data address from aclDataBuffer
*
* @param dataBuffer [IN] pointer to the data of aclDataBuffer
*
* @retval data address
*/
ACL_FUNC_VISIBILITY void *aclGetDataBufferAddr(const aclDataBuffer *dataBuffer);

/**
* @ingroup AscendCL
* @brief get data size of aclDataBuffer
*
* @param dataBuffer [IN] pointer to the data of aclDataBuffer
*
* @retval data size
*/
ACL_DEPRECATED_MESSAGE("aclGetDataBufferSize is deprecated, use aclGetDataBufferSizeV2 instead")
ACL_FUNC_VISIBILITY uint32_t aclGetDataBufferSize(const aclDataBuffer *dataBuffer);

/**
* @ingroup AscendCL
* @brief get data size of aclDataBuffer to replace aclGetDataBufferSize
*
* @param dataBuffer [IN] pointer to the data of aclDataBuffer
*
* @retval data size
*/
ACL_FUNC_VISIBILITY size_t aclGetDataBufferSizeV2(const aclDataBuffer *dataBuffer);

/**
* @ingroup AscendCL
* @brief get size of aclDataType
*
* @param dataType [IN] aclDataType data the size to get
*
* @retval size of the aclDataType
*/
ACL_FUNC_VISIBILITY size_t aclDataTypeSize(aclDataType dataType);

// interfaces of tensor desc
/**
* @ingroup AscendCL
* @brief create data aclTensorDesc
*
* @param dataType [IN] Data types described by tensor
* @param numDims [IN] the number of dimensions of the shape
* @param dims [IN] the size of the specified dimension
* @param format [IN] tensor format
*
* @retval aclTensorDesc pointer.
* @retval nullptr if param is invalid or run out of memory
*/
ACL_FUNC_VISIBILITY aclTensorDesc *aclCreateTensorDesc(aclDataType dataType, int numDims, const int64_t *dims,
aclFormat format);

/**
* @ingroup AscendCL
* @brief destroy data aclTensorDesc
*
* @param desc [IN] pointer to the data of aclTensorDesc to destroy
*/
ACL_FUNC_VISIBILITY void aclDestroyTensorDesc(const aclTensorDesc *desc);

/**
* @ingroup AscendCL
* @brief set tensor shape range for aclTensorDesc
*
* @param desc [OUT] pointer to the data of aclTensorDesc
* @param dimsCount [IN] the number of dimensions of the shape
* @param dimsRange [IN] the range of dimensions of the shape
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclSetTensorShapeRange(aclTensorDesc *desc, size_t dimsCount,
int64_t dimsRange[][ACL_TENSOR_SHAPE_RANGE_NUM]);

/**
* @ingroup AscendCL
* @brief get data type specified by the tensor description
*
* @param desc [IN] pointer to the instance of aclTensorDesc
*
* @retval data type specified by the tensor description.
* @retval ACL_DT_UNDEFINED if description is null
*/
ACL_FUNC_VISIBILITY aclDataType aclGetTensorDescType(const aclTensorDesc *desc);

/**
* @ingroup AscendCL
* @brief get data format specified by the tensor description
*
* @param desc [IN] pointer to the instance of aclTensorDesc
*
* @retval data format specified by the tensor description.
* @retval ACL_FORMAT_UNDEFINED if description is null
*/
ACL_FUNC_VISIBILITY aclFormat aclGetTensorDescFormat(const aclTensorDesc *desc);

/**
* @ingroup AscendCL
* @brief get tensor size specified by the tensor description
*
* @param desc [IN] pointer to the instance of aclTensorDesc
*
* @retval data size specified by the tensor description.
* @retval 0 if description is null
*/
ACL_FUNC_VISIBILITY size_t aclGetTensorDescSize(const aclTensorDesc *desc);

/**
* @ingroup AscendCL
* @brief get element count specified by the tensor description
*
* @param desc [IN] pointer to the instance of aclTensorDesc
*
* @retval element count specified by the tensor description.
* @retval 0 if description is null
*/
ACL_FUNC_VISIBILITY size_t aclGetTensorDescElementCount(const aclTensorDesc *desc);

/**
* @ingroup AscendCL
* @brief get number of dims specified by the tensor description
*
* @param desc [IN] pointer to the instance of aclTensorDesc
*
* @retval number of dims specified by the tensor description.
* @retval 0 if description is null
* @retval ACL_UNKNOWN_RANK if the tensor dim is -2
*/
ACL_FUNC_VISIBILITY size_t aclGetTensorDescNumDims(const aclTensorDesc *desc);

/**
* @ingroup AscendCL
* @brief Get the size of the specified dim in the tensor description
*
* @param desc [IN] pointer to the instance of aclTensorDesc
* @param index [IN] index of dims, start from 0.
*
* @retval dim specified by the tensor description and index.
* @retval -1 if description or index is invalid
*/
ACL_DEPRECATED_MESSAGE("aclGetTensorDescDim is deprecated, use aclGetTensorDescDimV2 instead")
ACL_FUNC_VISIBILITY int64_t aclGetTensorDescDim(const aclTensorDesc *desc, size_t index);

/**
* @ingroup AscendCL
* @brief Get the size of the specified dim in the tensor description
*
* @param desc [IN] pointer to the instance of aclTensorDesc
* @param index [IN] index of dims, start from 0.
* @param dimSize [OUT] size of the specified dim.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclGetTensorDescDimV2(const aclTensorDesc *desc, size_t index, int64_t *dimSize);

/**
* @ingroup AscendCL
* @brief Get the range of the specified dim in the tensor description
*
* @param desc [IN] pointer to the instance of aclTensorDesc
* @param index [IN] index of dims, start from 0.
* @param dimRangeNum [IN] number of dimRange.
* @param dimRange [OUT] range of the specified dim.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclGetTensorDescDimRange(const aclTensorDesc *desc, size_t index, size_t dimRangeNum,
int64_t *dimRange);

/**
* @ingroup AscendCL
* @brief set tensor description name
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param name [IN] tensor description name
*/
ACL_FUNC_VISIBILITY void aclSetTensorDescName(aclTensorDesc *desc, const char *name);

/**
* @ingroup AscendCL
* @brief get tensor description name
*
* @param desc [IN] pointer to the instance of aclTensorDesc
*
* @retval tensor description name.
* @retval empty string if description is null
*/
ACL_FUNC_VISIBILITY const char *aclGetTensorDescName(aclTensorDesc *desc);

/**
* @ingroup AscendCL
* @brief Convert the format in the source aclTensorDesc according to
* the specified dstFormat to generate a new target aclTensorDesc.
* The format in the source aclTensorDesc remains unchanged.
*
* @param srcDesc [IN] pointer to the source tensor desc
* @param dstFormat [IN] destination format
* @param dstDesc [OUT] pointer to the pointer to the destination tensor desc
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclTransTensorDescFormat(const aclTensorDesc *srcDesc, aclFormat dstFormat,
aclTensorDesc **dstDesc);

/**
* @ingroup AscendCL
* @brief Set the storage format specified by the tensor description
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param format [IN] the storage format
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_DEPRECATED_MESSAGE("aclSetTensorStorageFormat is deprecated, use aclSetTensorFormat instead")
ACL_FUNC_VISIBILITY aclError aclSetTensorStorageFormat(aclTensorDesc *desc, aclFormat format);

/**
* @ingroup AscendCL
* @brief Set the storage shape specified by the tensor description
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param numDims [IN] the number of dimensions of the shape
* @param dims [IN] the size of the specified dimension
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_DEPRECATED_MESSAGE("aclSetTensorStorageShape is deprecated, use aclSetTensorShape instead")
ACL_FUNC_VISIBILITY aclError aclSetTensorStorageShape(aclTensorDesc *desc, int numDims, const int64_t *dims);

/**
* @ingroup AscendCL
* @brief Set the format specified by the tensor description
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param format [IN] the storage format
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclSetTensorFormat(aclTensorDesc *desc, aclFormat format);

/**
* @ingroup AscendCL
* @brief Set the shape specified by the tensor description
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param numDims [IN] the number of dimensions of the shape
* @param dims [IN] the size of the specified dimension
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclSetTensorShape(aclTensorDesc *desc, int numDims, const int64_t *dims);

/**
* @ingroup AscendCL
* @brief Set the original format specified by the tensor description
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param format [IN] the storage format
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclSetTensorOriginFormat(aclTensorDesc *desc, aclFormat format);

/**
* @ingroup AscendCL
* @brief Set the original shape specified by the tensor description
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param numDims [IN] the number of dimensions of the shape
* @param dims [IN] the size of the specified dimension
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclSetTensorOriginShape(aclTensorDesc *desc, int numDims, const int64_t *dims);

/**
* @ingroup AscendCL
* @brief get op description info
*
* @param desc [IN] pointer to tensor description
* @param index [IN] index of tensor
*
* @retval null for failed.
* @retval OtherValues success.
*/
ACL_FUNC_VISIBILITY aclTensorDesc *aclGetTensorDescByIndex(aclTensorDesc *desc, size_t index);

/**
* @ingroup AscendCL
* @brief get address of tensor
*
* @param desc [IN] pointer to tensor description
*
* @retval null for failed
* @retval OtherValues success
*/
ACL_FUNC_VISIBILITY void *aclGetTensorDescAddress(const aclTensorDesc *desc);

/**
* @ingroup AscendCL
* @brief Set the dynamic input name specified by the tensor description
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param dynamicInputName [IN] pointer to the dynamic input name
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclSetTensorDynamicInput(aclTensorDesc *desc, const char *dynamicInputName);

/**
* @ingroup AscendCL
* @brief Set const data specified by the tensor description
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param dataBuffer [IN] pointer to the const databuffer
* @param length [IN] the length of const databuffer
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclSetTensorConst(aclTensorDesc *desc, void *dataBuffer, size_t length);

/**
* @ingroup AscendCL
* @brief Set tensor memory type specified by the tensor description
*
* @param desc [OUT] pointer to the instance of aclTensorDesc
* @param memType [IN] ACL_MEMTYPE_DEVICE means device, ACL_MEMTYPE_HOST means host
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclSetTensorPlaceMent(aclTensorDesc *desc, aclMemType memType);

/**
* @ingroup AscendCL
* @brief an interface for users to output APP logs
*
* @param logLevel [IN] the level of current log
* @param func [IN] the function where the log is located
* @param file [IN] the file where the log is located
* @param line [IN] Number of source lines where the log is located
* @param fmt [IN] the format of current log
* @param ... [IN] the value of current log
*/
ACL_FUNC_VISIBILITY void aclAppLog(aclLogLevel logLevel, const char *func, const char *file, uint32_t line,
const char *fmt, ...);

/**
* @ingroup AscendCL
* @brief get soc name
*
* @retval null for failed
* @retval OtherValues success
*/
ACL_FUNC_VISIBILITY const char *aclrtGetSocName();

#define ACL_APP_LOG(level, fmt, ...) aclAppLog(level, __FUNCTION__, __FILE__, __LINE__, fmt, ##__VA_ARGS__)

#ifdef __cplusplus
}
#endif

#endif // INC_EXTERNAL_ACL_ACL_BASE_H_
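
To make the shape-range API above concrete, a short sketch that describes a dynamic-shape tensor using only functions from this header (the -1 "dynamic dimension" convention is an assumption from common ACL usage, not stated in the header itself):

void DescribeDynamicInput() {
  int64_t dims[2] = {-1, 224};  // -1: size unknown until execution (assumed convention)
  aclTensorDesc *desc = aclCreateTensorDesc(ACL_FLOAT, 2, dims, ACL_FORMAT_ND);
  int64_t range[2][ACL_TENSOR_SHAPE_RANGE_NUM] = {{1, 32}, {224, 224}};  // {min, max} per dim
  (void)aclSetTensorShapeRange(desc, 2, range);  // bounds let the runtime pre-size buffers
  // ... hand desc to op/model APIs ...
  aclDestroyTensorDesc(desc);  // descriptions are created and destroyed by the caller
}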

inc/external/acl/acl_mdl.h (+1225, -0)
File diff suppressed because it is too large


inc/external/acl/acl_op.h (+504, -0)

@@ -0,0 +1,504 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_ACL_ACL_OP_H_
#define INC_EXTERNAL_ACL_ACL_OP_H_

#include "acl_base.h"
#include "acl_rt.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct aclopHandle aclopHandle;
typedef struct aclopAttr aclopAttr;
typedef struct aclopKernelDesc aclopKernelDesc;

typedef void (*aclDataDeallocator)(void *data, size_t length);

static const int ACL_COMPILE_FLAG_BIN_SELECTOR = 1;

typedef enum aclEngineType {
ACL_ENGINE_SYS,
ACL_ENGINE_AICORE,
ACL_ENGINE_VECTOR,
} aclopEngineType;

/**
* @ingroup AscendCL
* @brief Set base directory that contains single op models
*
* @par Restriction
* The aclopSetModelDir interface can be called only once in a process.
* @param modelDir [IN] path of the directory
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetModelDir(const char *modelDir);

/**
* @ingroup AscendCL
* @brief load single op models from memory
*
* @par Restriction
* The aclopLoad interface can be called more than once in a process.
* @param model [IN] address of single op models
* @param modelSize [IN] size of single op models
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopLoad(const void *model, size_t modelSize);

/**
* @ingroup AscendCL
* @brief create data of type aclopAttr
*
* @retval pointer to created instance.
* @retval nullptr if run out of memory
*/
ACL_FUNC_VISIBILITY aclopAttr *aclopCreateAttr();

/**
* @ingroup AscendCL
* @brief destroy data of type aclopAttr
*
* @param attr [IN] pointer to the instance of aclopAttr
*/
ACL_FUNC_VISIBILITY void aclopDestroyAttr(const aclopAttr *attr);

/**
* @ingroup AscendCL
* @brief set an attribute. the type of the attribute is bool
*
* @param attr [OUT] pointer to the instance of aclopAttr
* @param attrName [IN] attribute name
* @param attrValue [IN] attribute value
* false if attrValue is 0, true otherwise.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetAttrBool(aclopAttr *attr, const char *attrName, uint8_t attrValue);

/**
* @ingroup AscendCL
* @brief set an attribute. the type of the attribute is int64_t
*
* @param attr [OUT] pointer to the instance of aclopAttr
* @param attrName [IN] attribute name
* @param attrValue [IN] attribute value
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetAttrInt(aclopAttr *attr, const char *attrName, int64_t attrValue);

/**
* @ingroup AscendCL
* @brief set an attribute. the type of the attribute is float
*
* @param attr [OUT] pointer to the instance of aclopAttr
* @param attrName [IN] attribute name
* @param attrValue [IN] attribute value
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetAttrFloat(aclopAttr *attr, const char *attrName, float attrValue);

/**
* @ingroup AscendCL
* @brief set an attribute. the type of the attribute is string
*
* @param attr [OUT] pointer to the instance of aclopAttr
* @param attrName [IN] attribute name
* @param attrValue [IN] attribute value
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetAttrString(aclopAttr *attr, const char *attrName, const char *attrValue);

/**
* @ingroup AscendCL
* @brief set an attribute. the type of the attribute is list of bools
*
* @param attr [OUT] pointer to the instance of aclopAttr
* @param attrName [IN] attribute name
* @param numValues [IN] number of values
* @param values [IN] pointer to values; each value is false if 0, true otherwise
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetAttrListBool(aclopAttr *attr, const char *attrName, int numValues,
const uint8_t *values);

/**
* @ingroup AscendCL
* @brief set an attribute. the type of the attribute is list of ints
*
* @param attr [OUT] pointer to the instance of aclopAttr
* @param attrName [IN] attribute name
* @param numValues [IN] number of values
* @param values [IN] pointer to values
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetAttrListInt(aclopAttr *attr, const char *attrName, int numValues,
const int64_t *values);

/**
* @ingroup AscendCL
* @brief set an attribute. the type of the attribute is list of floats
*
* @param attr [OUT] pointer to the instance of aclopAttr
* @param attrName [IN] attribute name
* @param numValues [IN] number of values
* @param values [IN] pointer to values
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetAttrListFloat(aclopAttr *attr, const char *attrName, int numValues,
const float *values);

/**
* @ingroup AscendCL
* @brief set an attribute. the type of the attribute is list of strings
*
* @param attr [OUT] pointer to the instance of aclopAttr
* @param attrName [IN] attribute name
* @param numValues [IN] number of values
* @param values [IN] pointer to values
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetAttrListString(aclopAttr *attr, const char *attrName, int numValues,
const char **values);

/**
* @ingroup AscendCL
* @brief set an attribute. the type of the attribute is list of list of ints
*
* @param attr [OUT] pointer to the instance of aclopAttr
* @param attrName [IN] attribute name
* @param numLists [IN] number of lists
* @param numValues [IN] pointer to number of values of each list
* @param values [IN] pointer to values
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetAttrListListInt(aclopAttr *attr, const char *attrName, int numLists,
const int *numValues, const int64_t *const values[]);

/**
* @ingroup AscendCL
* @brief Load and execute the specified operator asynchronously
*
* @par Restriction
* @li The input and output organization of each operator is different,
* and the application needs to organize the operator strictly
* according to the operator input and output parameters when calling.
* @li When the user calls aclopExecute,
* the ACL finds the corresponding task according to the optype,
* the description of the input tensor,
* the description of the output tensor, and attr, and issues the execution.
*
* @param opType [IN] type of op
* @param numInputs [IN] number of inputs
* @param inputDesc [IN] pointer to array of input tensor descriptions
* @param inputs [IN] pointer to array of input buffers
* @param numOutputs [IN] number of outputs
* @param outputDesc [IN] pointer to array of output tensor descriptions
* @param outputs [OUT] pointer to array of output buffers
* @param attr [IN] pointer to instance of aclopAttr.
* may pass nullptr if the op has no attribute
* @param stream [IN] stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_DEPRECATED_MESSAGE("aclopExecute is deprecated, use aclopExecuteV2 instead")
ACL_FUNC_VISIBILITY aclError aclopExecute(const char *opType, int numInputs, const aclTensorDesc *const inputDesc[],
const aclDataBuffer *const inputs[], int numOutputs,
const aclTensorDesc *const outputDesc[], aclDataBuffer *const outputs[],
const aclopAttr *attr, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief Load and execute the specified operator
* The difference with aclopExecute is that aclopExecuteV2 will refresh outputDesc
*
* @par Restriction
* @li The input and output organization of each operator is different,
* and the application needs to organize the operator strictly
* according to the operator input and output parameters when calling.
* @li When the user calls aclopExecuteV2,
* the ACL finds the corresponding task according to the optype,
* the description of the input tensor,
* the description of the output tensor, and attr, and issues the execution.
*
* @param opType [IN] type of op
* @param numInputs [IN] number of inputs
* @param inputDesc [IN] pointer to array of input tensor descriptions
* @param inputs [IN] pointer to array of input buffers
* @param numOutputs [IN] number of outputs
* @param outputDesc [IN|OUT] pointer to array of output tensor descriptions
* @param outputs [OUT] pointer to array of output buffers
* @param attr [IN] pointer to instance of aclopAttr.
* may pass nullptr if the op has no attribute
* @param stream [IN] stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopExecuteV2(const char *opType, int numInputs, aclTensorDesc *inputDesc[],
aclDataBuffer *inputs[], int numOutputs, aclTensorDesc *outputDesc[],
aclDataBuffer *outputs[], aclopAttr *attr, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create an instance of aclopHandle.
*
* @param opType [IN] type of op
* @param numInputs [IN] number of inputs
* @param inputDesc [IN] pointer to array of input tensor descriptions
* @param numOutputs [IN] number of outputs
* @param outputDesc [IN] pointer to array of output tensor descriptions
* @param opAttr [IN] pointer to instance of aclopAttr.
* may pass nullptr if the op has no attribute
* @param handle [OUT] pointer to the pointer to the handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopCreateHandle(const char *opType, int numInputs,
const aclTensorDesc *const inputDesc[], int numOutputs,
const aclTensorDesc *const outputDesc[], const aclopAttr *opAttr,
aclopHandle **handle);

/**
* @ingroup AscendCL
* @brief destroy aclopHandle instance
*
* @param handle [IN] pointer to the instance of aclopHandle
*/
ACL_FUNC_VISIBILITY void aclopDestroyHandle(aclopHandle *handle);

/**
* @ingroup AscendCL
* @brief execute an op with the handle.
* can save op model matching cost compared with aclopExecute
*
* @param handle [IN] pointer to the instance of aclopHandle.
* The aclopCreateHandle interface has been called
* in advance to create aclopHandle type data.
* @param numInputs [IN] number of inputs
* @param inputs [IN] pointer to array of input buffers.
* The aclCreateDataBuffer interface has been called
* in advance to create aclDataBuffer type data.
* @param numOutputs [IN] number of outputs
* @param outputs [OUT] pointer to array of output buffers
* @param stream [IN] stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclopCreateHandle | aclCreateDataBuffer
*/
ACL_FUNC_VISIBILITY aclError aclopExecWithHandle(aclopHandle *handle, int numInputs,
const aclDataBuffer *const inputs[], int numOutputs,
aclDataBuffer *const outputs[], aclrtStream stream);

/**
* @ingroup AscendCL
* @brief cast data type
*
* @param srcDesc [IN] source tensor desc
* @param srcBuffer [IN] source tensor buffer
* @param dstDesc [IN] destination tensor desc
* @param dstBuffer [OUT] destination tensor buffer
* @param truncate [IN] do not truncate if value is 0, truncate otherwise
* @param stream [IN] stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopCast(const aclTensorDesc *srcDesc, const aclDataBuffer *srcBuffer,
const aclTensorDesc *dstDesc, aclDataBuffer *dstBuffer, uint8_t truncate,
aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create a handle for casting datatype
*
* @param srcDesc [IN] source tensor desc
* @param dstDesc [IN] destination tensor desc
* @param truncate [IN] do not truncate if value is 0, truncate otherwise
* @param handle [OUT] pointer to the pointer to the handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopCreateHandleForCast(aclTensorDesc *srcDesc, aclTensorDesc *dstDesc, uint8_t truncate,
aclopHandle **handle);

/**
* @ingroup AscendCL
* @brief create kernel
*
* @param opType [IN] op type
* @param kernelId [IN] kernel id
* @param kernelName [IN] kernel name
* @param binData [IN] kernel bin data
* @param binSize [IN] kernel bin size
 * @param enginetype [IN] engine type
* @param deallocator [IN] callback function for deallocating bin data,
* null if bin data to be deallocated by caller
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclopCompile
*/
ACL_FUNC_VISIBILITY aclError aclopCreateKernel(const char *opType, const char *kernelId, const char *kernelName,
void *binData, int binSize, aclopEngineType enginetype,
aclDataDeallocator deallocator);

/**
* @ingroup AscendCL
* @brief create kernel
*
* @param numInputs [IN] number of inputs
* @param inputDesc [IN] pointer to array of input tensor descriptions
* @param numOutputs [IN] number of outputs
* @param outputDesc [IN] pointer to array of output tensor descriptions
* @param opAttr [IN] pointer to instance of aclopAttr
* @param aclopKernelDesc [IN] pointer to instance of aclopKernelDesc
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
typedef aclError (*aclopCompileFunc)(int numInputs, const aclTensorDesc *const inputDesc[], int numOutputs,
const aclTensorDesc *const outputDesc[], const aclopAttr *opAttr,
aclopKernelDesc *aclopKernelDesc);

/**
* @ingroup AscendCL
* @brief register compile function
*
* @param opType [IN] op type
* @param func [IN] compile function
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclopUnregisterCompileFunc
*/
ACL_FUNC_VISIBILITY aclError aclopRegisterCompileFunc(const char *opType, aclopCompileFunc func);

/**
* @ingroup AscendCL
* @brief unregister compile function
*
* @param opType [IN] op type
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopUnregisterCompileFunc(const char *opType);

/**
* @ingroup AscendCL
* @brief set kernel args
*
* @param kernelDesc [IN] pointer to instance of aclopKernelDesc
* @param kernelId [IN] kernel id
* @param blockDim [IN] block dim
* @param args [IN] args
* @param argSize [IN] size in bytes of args
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetKernelArgs(aclopKernelDesc *kernelDesc, const char *kernelId, uint32_t blockDim,
const void *args, uint32_t argSize);

/**
* @ingroup AscendCL
* @brief set workspace sizes
*
* @param kernelDesc [IN] pointer to instance of aclopKernelDesc
* @param numWorkspaces [IN] number of workspaces
* @param workspaceSizes [IN] pointer to array of sizes of workspaces
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetKernelWorkspaceSizes(aclopKernelDesc *kernelDesc, int numWorkspaces,
size_t *workspaceSizes);
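
/*
 * Registration sketch (illustrative only): a minimal aclopCompileFunc that
 * binds a pre-created kernel (see aclopCreateKernel) to launch arguments and
 * workspace sizes. The kernel id "my_add_kernel", the raw arg blob and the
 * block dim are hypothetical and depend on the actual kernel binary.
 */
#if 0
static aclError MyAddCompileFunc(int numInputs, const aclTensorDesc *const inputDesc[], int numOutputs,
                                 const aclTensorDesc *const outputDesc[], const aclopAttr *opAttr,
                                 aclopKernelDesc *kernelDesc) {
  uint8_t args[24] = {0};  /* hypothetical raw argument blob expected by the kernel */
  aclError ret = aclopSetKernelArgs(kernelDesc, "my_add_kernel", 1 /* blockDim */, args, (uint32_t)sizeof(args));
  if (ret != ACL_SUCCESS) {
    return ret;
  }
  size_t workspaceSizes[] = {0};  /* one workspace of size 0, purely illustrative */
  return aclopSetKernelWorkspaceSizes(kernelDesc, 1, workspaceSizes);
}
/* registered once at startup: aclopRegisterCompileFunc("Add", MyAddCompileFunc); */
#endif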

/**
* @ingroup AscendCL
* @brief compile op with dynamic shape
*
* @param opType [IN] op type
* @param numInputs [IN] number of inputs
* @param inputDesc [IN] pointer to array of input tensor descriptions
* @param numOutputs [IN] number of outputs
* @param outputDesc [IN] pointer to array of output tensor descriptions
* @param attr [IN] pointer to instance of aclopAttr.
* may pass nullptr if the op has no attribute
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopUpdateParams(const char *opType, int numInputs,
const aclTensorDesc *const inputDesc[], int numOutputs,
const aclTensorDesc *const outputDesc[], const aclopAttr *attr);

/**
* @ingroup AscendCL
 * @brief infer the output shapes of the specified operator synchronously
*
* @param opType [IN] type of op
* @param numInputs [IN] number of inputs
* @param inputDesc [IN] pointer to array of input tensor descriptions
* @param inputs [IN] pointer to array of input buffers
* @param numOutputs [IN] number of outputs
* @param outputDesc [OUT] pointer to array of output tensor descriptions
* @param attr [IN] pointer to instance of aclopAttr.
* may pass nullptr if the op has no attribute
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopInferShape(const char *opType, int numInputs, aclTensorDesc *inputDesc[],
aclDataBuffer *inputs[], int numOutputs, aclTensorDesc *outputDesc[],
aclopAttr *attr);

#ifdef __cplusplus
}
#endif

#endif // INC_EXTERNAL_ACL_ACL_OP_H_

+ 121  - 0   inc/external/acl/acl_op_compiler.h

@@ -0,0 +1,121 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_ACL_ACL_OP_COMPILER_H_
#define INC_EXTERNAL_ACL_ACL_OP_COMPILER_H_

#include "acl_base.h"
#include "acl_op.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef enum aclCompileType { ACL_COMPILE_SYS, ACL_COMPILE_UNREGISTERED } aclopCompileType;

typedef enum {
ACL_PRECISION_MODE,
ACL_AICORE_NUM,
ACL_AUTO_TUNE_MODE,
ACL_OP_SELECT_IMPL_MODE,
ACL_OPTYPELIST_FOR_IMPLMODE,
ACL_OP_DEBUG_LEVEL,
ACL_DEBUG_DIR,
ACL_OP_COMPILER_CACHE_MODE,
ACL_OP_COMPILER_CACHE_DIR,
ACL_OP_PERFORMANCE_MODE
} aclCompileOpt;

typedef enum aclCompileFlag { ACL_OP_COMPILE_DEFAULT, ACL_OP_COMPILE_FUZZ } aclOpCompileFlag;

/**
* @ingroup AscendCL
* @brief compile op
*
* @param opType [IN] op type
* @param numInputs [IN] number of inputs
* @param inputDesc [IN] pointer to array of input tensor descriptions
* @param numOutputs [IN] number of outputs
* @param outputDesc [IN] pointer to array of output tensor descriptions
* @param attr [IN] pointer to instance of aclopAttr.
* may pass nullptr if the op has no attribute
* @param engineType [IN] engine type
* @param compileFlag [IN] compile flag
* @param opPath [IN] path of op
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopCompile(const char *opType, int numInputs, const aclTensorDesc *const inputDesc[],
int numOutputs, const aclTensorDesc *const outputDesc[],
const aclopAttr *attr, aclopEngineType engineType,
aclopCompileType compileFlag, const char *opPath);

/**
* @ingroup AscendCL
* @brief compile and execute op
*
* @param opType [IN] op type
* @param numInputs [IN] number of inputs
* @param inputDesc [IN] pointer to array of input tensor descriptions
* @param inputs [IN] pointer to array of input buffers
* @param numOutputs [IN] number of outputs
* @param outputDesc [IN] pointer to array of output tensor descriptions
 * @param outputs [OUT] pointer to array of output buffers
* @param attr [IN] pointer to instance of aclopAttr.
* may pass nullptr if the op has no attribute
* @param engineType [IN] engine type
* @param compileFlag [IN] compile flag
* @param opPath [IN] path of op
* @param stream [IN] stream handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopCompileAndExecute(
const char *opType, int numInputs, const aclTensorDesc *const inputDesc[], const aclDataBuffer *const inputs[],
int numOutputs, const aclTensorDesc *const outputDesc[], aclDataBuffer *const outputs[], const aclopAttr *attr,
aclopEngineType engineType, aclopCompileType compileFlag, const char *opPath, aclrtStream stream);
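
/*
 * Usage sketch (illustrative only): one-call compile-and-execute. The
 * descriptors and buffers are assumed to be prepared as for aclopExecuteV2,
 * and ACL_ENGINE_SYS is assumed from the aclopEngineType enum in acl_op.h.
 */
#if 0
aclError ret = aclopCompileAndExecute("Add", 2, inputDesc, inputs, 1, outputDesc, outputs,
                                      NULL /* no attr */, ACL_ENGINE_SYS, ACL_COMPILE_SYS,
                                      NULL /* no custom op path */, stream);
if (ret == ACL_SUCCESS) {
  ret = aclrtSynchronizeStream(stream);  /* execution is asynchronous */
}
#endif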

/**
* @ingroup AscendCL
* @brief set compile option
*
 * @param opt [IN] compile option
* @param value [IN] pointer for the option value
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclSetCompileopt(aclCompileOpt opt, const char *value);
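
/*
 * Usage sketch (illustrative only): compile options are typically set once
 * before compiling ops. The option values below are examples, not defaults.
 */
#if 0
(void)aclSetCompileopt(ACL_PRECISION_MODE, "allow_fp32_to_fp16");
(void)aclSetCompileopt(ACL_OP_COMPILER_CACHE_MODE, "enable");
(void)aclSetCompileopt(ACL_OP_COMPILER_CACHE_DIR, "/tmp/op_cache");
#endif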

/**
* @ingroup AscendCL
* @brief set compile flag
*
* @param flag [IN] compile flag, ACL_OP_COMPILE_DEFAULT means compile with default mode
* ACL_OP_COMPILE_FUZZ means compile with fuzz mode
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclopSetCompileFlag(aclOpCompileFlag flag);

#ifdef __cplusplus
}
#endif

#endif // INC_EXTERNAL_ACL_ACL_OP_COMPILER_H_

+ 329  - 0   inc/external/acl/acl_prof.h

@@ -0,0 +1,329 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_ACL_PROF_H_
#define INC_EXTERNAL_ACL_PROF_H_

#include "acl_base.h"

#ifdef __cplusplus
extern "C" {
#endif

#define ACL_PROF_ACL_API 0x0001
#define ACL_PROF_TASK_TIME 0x0002
#define ACL_PROF_AICORE_METRICS 0x0004
#define ACL_PROF_AICPU 0x0008

/**
 * @deprecated please use aclprofGetOpTypeLen and aclprofGetOpNameLen instead
*/
#define ACL_PROF_MAX_OP_NAME_LEN 257
#define ACL_PROF_MAX_OP_TYPE_LEN 65

typedef enum {
ACL_AICORE_ARITHMETIC_UTILIZATION = 0,
ACL_AICORE_PIPE_UTILIZATION = 1,
ACL_AICORE_MEMORY_BANDWIDTH = 2,
ACL_AICORE_L0B_AND_WIDTH = 3,
ACL_AICORE_RESOURCE_CONFLICT_RATIO = 4,
ACL_AICORE_NONE = 0xFF
} aclprofAicoreMetrics;

typedef struct aclprofConfig aclprofConfig;
typedef struct aclprofStopConfig aclprofStopConfig;
typedef struct aclprofAicoreEvents aclprofAicoreEvents;
typedef struct aclprofSubscribeConfig aclprofSubscribeConfig;

/**
* @ingroup AscendCL
* @brief profiling initialize
*
* @param profilerResultPath [IN] path of profiling result
* @param length [IN] length of profilerResultPath
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclprofFinalize
*/
ACL_FUNC_VISIBILITY aclError aclprofInit(const char *profilerResultPath, size_t length);

/**
* @ingroup AscendCL
* @brief profiling finalize
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclprofInit
*/
ACL_FUNC_VISIBILITY aclError aclprofFinalize();

/**
* @ingroup AscendCL
* @brief Start profiling modules by profilerConfig
*
* @param profilerConfig [IN] config of profiling
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclprofStop
*/
ACL_FUNC_VISIBILITY aclError aclprofStart(const aclprofConfig *profilerConfig);

/**
* @ingroup AscendCL
* @brief Create data of type aclprofConfig
*
* @param deviceIdList [IN] list of device id
* @param deviceNums [IN] number of devices
* @param aicoreMetrics [IN] type of aicore metrics
* @param aicoreEvents [IN] pointer to aicore events, only support NULL now
 * @param dataTypeConfig [IN] configuration of the modules to be profiled
*
* @retval the aclprofConfig pointer
*
* @see aclprofDestroyConfig
*/
ACL_FUNC_VISIBILITY aclprofConfig *aclprofCreateConfig(uint32_t *deviceIdList, uint32_t deviceNums,
aclprofAicoreMetrics aicoreMetrics,
aclprofAicoreEvents *aicoreEvents, uint64_t dataTypeConfig);

/**
* @ingroup AscendCL
* @brief Destroy data of type aclprofConfig
*
* @param profilerConfig [IN] config of profiling
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclprofCreateConfig
*/
ACL_FUNC_VISIBILITY aclError aclprofDestroyConfig(const aclprofConfig *profilerConfig);

/**
* @ingroup AscendCL
 * @brief stop profiling modules by profilerConfig
 *
 * @param profilerConfig [IN] pointer to the config of profiling
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclprofStart
*/
ACL_FUNC_VISIBILITY aclError aclprofStop(const aclprofConfig *profilerConfig);
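
/*
 * Lifecycle sketch (illustrative only): init -> create config -> start ->
 * run workload -> stop -> destroy config -> finalize. The result path and
 * device id 0 are placeholders; strlen comes from <string.h>.
 */
#if 0
const char *resultPath = "/tmp/prof";
aclprofInit(resultPath, strlen(resultPath));
uint32_t deviceIdList[] = {0};
aclprofConfig *config = aclprofCreateConfig(deviceIdList, 1, ACL_AICORE_PIPE_UTILIZATION, NULL,
                                            ACL_PROF_ACL_API | ACL_PROF_TASK_TIME);
aclprofStart(config);
/* ... launch and synchronize the workload to be profiled ... */
aclprofStop(config);
aclprofDestroyConfig(config);
aclprofFinalize();
#endif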

/**
* @ingroup AscendCL
* @brief subscribe profiling data of model
*
* @param modelId [IN] the model id subscribed
* @param profSubscribeConfig [IN] pointer to config of model subscribe
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclprofModelUnSubscribe
*/
ACL_FUNC_VISIBILITY aclError aclprofModelSubscribe(uint32_t modelId, const aclprofSubscribeConfig *profSubscribeConfig);

/**
* @ingroup AscendCL
* @brief unsubscribe profiling data of model
*
* @param modelId [IN] the model id unsubscribed
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclprofModelSubscribe
*/
ACL_FUNC_VISIBILITY aclError aclprofModelUnSubscribe(uint32_t modelId);

/**
* @ingroup AscendCL
* @brief create subscribe config
*
 * @param timeInfoSwitch [IN] switch that controls whether to get time info from the model
* @param aicoreMetrics [IN] aicore metrics
* @param fd [IN] pointer to write pipe
*
* @retval the aclprofSubscribeConfig pointer
*
* @see aclprofDestroySubscribeConfig
*/
ACL_FUNC_VISIBILITY aclprofSubscribeConfig *aclprofCreateSubscribeConfig(int8_t timeInfoSwitch,
aclprofAicoreMetrics aicoreMetrics, void *fd);

/**
* @ingroup AscendCL
* @brief destroy subscribe config
*
* @param profSubscribeConfig [IN] subscribe config
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclprofCreateSubscribeConfig
*/
ACL_FUNC_VISIBILITY aclError aclprofDestroySubscribeConfig(const aclprofSubscribeConfig *profSubscribeConfig);

/**
* @ingroup AscendCL
 * @brief get size of op description
*
* @param opDescSize [OUT] size of op desc
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclprofGetOpDescSize(size_t *opDescSize);

/**
* @ingroup AscendCL
* @brief get op number from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param opNumber [OUT] op number of subscription data
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclprofGetOpNum(const void *opInfo, size_t opInfoLen, uint32_t *opNumber);

/**
* @ingroup AscendCL
 * @brief get length of op type from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param index [IN] index of op array in opInfo
* @param opTypeLen [OUT] actual length of op type string
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclprofGetOpTypeLen(const void *opInfo, size_t opInfoLen, uint32_t index,
size_t *opTypeLen);

/**
* @ingroup AscendCL
* @brief get op type from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param index [IN] index of op array in opInfo
* @param opType [OUT] obtained op type string
 * @param opTypeLen [IN] size of the opType buffer, as obtained via aclprofGetOpTypeLen
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclprofGetOpType(const void *opInfo, size_t opInfoLen, uint32_t index, char *opType,
size_t opTypeLen);

/**
* @ingroup AscendCL
 * @brief get length of op name from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param index [IN] index of op array in opInfo
* @param opNameLen [OUT] actual length of op name string
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclprofGetOpNameLen(const void *opInfo, size_t opInfoLen, uint32_t index,
size_t *opNameLen);

/**
* @ingroup AscendCL
 * @brief get op name from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param index [IN] index of op array in opInfo
* @param opName [OUT] obtained op name string
 * @param opNameLen [IN] size of the opName buffer, as obtained via aclprofGetOpNameLen
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclprofGetOpName(const void *opInfo, size_t opInfoLen, uint32_t index, char *opName,
size_t opNameLen);
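
/*
 * Parsing sketch (illustrative only): the two-call pattern for reading op
 * records out of subscription data -- query the string length first, then
 * fetch into a buffer of that size (+1 in case the reported length excludes
 * the terminator). malloc/free come from <stdlib.h>; opInfo/opInfoLen are
 * assumed to hold data read from the subscription pipe.
 */
#if 0
uint32_t opNum = 0;
if (aclprofGetOpNum(opInfo, opInfoLen, &opNum) == ACL_SUCCESS) {
  for (uint32_t i = 0; i < opNum; ++i) {
    size_t nameLen = 0;
    aclprofGetOpNameLen(opInfo, opInfoLen, i, &nameLen);
    char *opName = (char *)malloc(nameLen + 1);
    aclprofGetOpName(opInfo, opInfoLen, i, opName, nameLen + 1);
    /* ... combine with aclprofGetOpStart/End/Duration(opInfo, opInfoLen, i) ... */
    free(opName);
  }
}
#endif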

/**
* @ingroup AscendCL
* @brief get start time of specified op from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param index [IN] index of op array in opInfo
*
 * @retval start timestamp (us) of the specified op
* @retval 0 for failed
*/
ACL_FUNC_VISIBILITY uint64_t aclprofGetOpStart(const void *opInfo, size_t opInfoLen, uint32_t index);

/**
* @ingroup AscendCL
* @brief get end time of specified op from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param index [IN] index of op array in opInfo
*
 * @retval end timestamp (us) of the specified op
* @retval 0 for failed
*/
ACL_FUNC_VISIBILITY uint64_t aclprofGetOpEnd(const void *opInfo, size_t opInfoLen, uint32_t index);

/**
* @ingroup AscendCL
 * @brief get execution time of specified op from subscription data
*
* @param opInfo [IN] pointer to subscription data
* @param opInfoLen [IN] memory size of subscription data
* @param index [IN] index of op array in opInfo
*
 * @retval execution time (us) of the specified op
* @retval 0 for failed
*/
ACL_FUNC_VISIBILITY uint64_t aclprofGetOpDuration(const void *opInfo, size_t opInfoLen, uint32_t index);

/**
* @ingroup AscendCL
* @brief get model id from subscription data
*
* @param opInfo [IN] pointer to subscription data
 * @param opInfoLen [IN] memory size of subscription data
 * @param index [IN] index of op array in opInfo
*
* @retval model id of subscription data
* @retval 0 for failed
*/
ACL_FUNC_VISIBILITY size_t aclprofGetModelId(const void *opInfo, size_t opInfoLen, uint32_t index);

#ifdef __cplusplus
}
#endif

#endif // INC_EXTERNAL_ACL_PROF_H_

+ 958  - 0   inc/external/acl/acl_rt.h

@@ -0,0 +1,958 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_ACL_ACL_RT_H_
#define INC_EXTERNAL_ACL_ACL_RT_H_

#include <stdint.h>
#include <stddef.h>
#include "acl_base.h"

#ifdef __cplusplus
extern "C" {
#endif

#define ACL_EVENT_TIME_LINE 0x00000008u

typedef enum aclrtRunMode {
ACL_DEVICE,
ACL_HOST,
} aclrtRunMode;

typedef enum aclrtTsId {
ACL_TS_ID_AICORE = 0,
ACL_TS_ID_AIVECTOR = 1,
ACL_TS_ID_RESERVED = 2,
} aclrtTsId;

typedef enum aclrtEventStatus {
ACL_EVENT_STATUS_COMPLETE = 0,
ACL_EVENT_STATUS_NOT_READY = 1,
ACL_EVENT_STATUS_RESERVED = 2,
} aclrtEventStatus;

typedef enum aclrtCallbackBlockType {
ACL_CALLBACK_NO_BLOCK,
ACL_CALLBACK_BLOCK,
} aclrtCallbackBlockType;

typedef enum aclrtMemcpyKind {
ACL_MEMCPY_HOST_TO_HOST,
ACL_MEMCPY_HOST_TO_DEVICE,
ACL_MEMCPY_DEVICE_TO_HOST,
ACL_MEMCPY_DEVICE_TO_DEVICE,
} aclrtMemcpyKind;

typedef enum aclrtMemMallocPolicy {
ACL_MEM_MALLOC_HUGE_FIRST,
ACL_MEM_MALLOC_HUGE_ONLY,
ACL_MEM_MALLOC_NORMAL_ONLY,
ACL_MEM_MALLOC_HUGE_FIRST_P2P,
ACL_MEM_MALLOC_HUGE_ONLY_P2P,
ACL_MEM_MALLOC_NORMAL_ONLY_P2P,
} aclrtMemMallocPolicy;

typedef enum aclrtMemAttr {
ACL_DDR_MEM,
ACL_HBM_MEM,
ACL_DDR_MEM_HUGE,
ACL_DDR_MEM_NORMAL,
ACL_HBM_MEM_HUGE,
ACL_HBM_MEM_NORMAL,
ACL_DDR_MEM_P2P_HUGE,
ACL_DDR_MEM_P2P_NORMAL,
ACL_HBM_MEM_P2P_HUGE,
ACL_HBM_MEM_P2P_NORMAL,
} aclrtMemAttr;

typedef enum aclrtGroupAttr {
ACL_GROUP_AICORE_INT,
ACL_GROUP_AIV_INT,
ACL_GROUP_AIC_INT,
ACL_GROUP_SDMANUM_INT,
ACL_GROUP_ASQNUM_INT,
ACL_GROUP_GROUPID_INT
} aclrtGroupAttr;

typedef struct tagRtGroupInfo aclrtGroupInfo;

typedef struct rtExceptionInfo aclrtExceptionInfo;

typedef void (*aclrtCallback)(void *userData);

typedef void (*aclrtExceptionInfoCallback)(aclrtExceptionInfo *exceptionInfo);

/**
* @ingroup AscendCL
* @brief Set a callback function to handle exception information
*
* @param callback [IN] callback function to handle exception information
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtSetExceptionInfoCallback(aclrtExceptionInfoCallback callback);

/**
* @ingroup AscendCL
* @brief Get task id from exception information
*
* @param info [IN] pointer of exception information
*
* @retval The task id from exception information
* @retval 0xFFFFFFFF if info is null
*/
ACL_FUNC_VISIBILITY uint32_t aclrtGetTaskIdFromExceptionInfo(const aclrtExceptionInfo *info);

/**
* @ingroup AscendCL
* @brief Get stream id from exception information
*
* @param info [IN] pointer of exception information
*
* @retval The stream id from exception information
* @retval 0xFFFFFFFF if info is null
*/
ACL_FUNC_VISIBILITY uint32_t aclrtGetStreamIdFromExceptionInfo(const aclrtExceptionInfo *info);

/**
* @ingroup AscendCL
* @brief Get thread id from exception information
*
* @param info [IN] pointer of exception information
*
 * @retval The thread id of the failed task
* @retval 0xFFFFFFFF if info is null
*/
ACL_FUNC_VISIBILITY uint32_t aclrtGetThreadIdFromExceptionInfo(const aclrtExceptionInfo *info);

/**
* @ingroup AscendCL
* @brief Get device id from exception information
*
* @param info [IN] pointer of exception information
*
 * @retval The device id of the failed task
* @retval 0xFFFFFFFF if info is null
*/
ACL_FUNC_VISIBILITY uint32_t aclrtGetDeviceIdFromExceptionInfo(const aclrtExceptionInfo *info);

/**
* @ingroup AscendCL
 * @brief Specify the thread that processes the callback functions on the Stream
*
* @param threadId [IN] thread ID
* @param stream [IN] stream handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtSubscribeReport(uint64_t threadId, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief Add a callback function to be executed on the host
* to the task queue of the Stream
*
* @param fn [IN] Specify the callback function to be added
* The function prototype of the callback function is:
* typedef void (*aclrtCallback)(void *userData);
* @param userData [IN] User data to be passed to the callback function
* @param blockType [IN] callback block type
* @param stream [IN] stream handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtLaunchCallback(aclrtCallback fn, void *userData, aclrtCallbackBlockType blockType,
aclrtStream stream);

/**
* @ingroup AscendCL
* @brief After waiting for a specified time, trigger callback processing
*
* @par Function
* The thread processing callback specified by
* the aclrtSubscribeReport interface
*
* @param timeout [IN] timeout value
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtSubscribeReport
*/
ACL_FUNC_VISIBILITY aclError aclrtProcessReport(int32_t timeout);

/**
* @ingroup AscendCL
* @brief Cancel thread registration,
* the callback function on the specified Stream
* is no longer processed by the specified thread
*
* @param threadId [IN] thread ID
* @param stream [IN] stream handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtUnSubscribeReport(uint64_t threadId, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create context and associates it with the calling thread
*
* @par Function
* The following use cases are supported:
* @li If you don't call the aclrtCreateContext interface
* to explicitly create the context,
* the system will use the default context, which is implicitly created
* when the aclrtSetDevice interface is called.
* @li If multiple contexts are created in a process
* (there is no limit on the number of contexts),
* the current thread can only use one of them at the same time.
* It is recommended to explicitly specify the context of the current thread
 * through the aclrtSetCurrentContext interface to increase
 * the maintainability of the program.
*
* @param context [OUT] point to the created context
* @param deviceId [IN] device to create context on
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtSetDevice | aclrtSetCurrentContext
*/
ACL_FUNC_VISIBILITY aclError aclrtCreateContext(aclrtContext *context, int32_t deviceId);

/**
* @ingroup AscendCL
* @brief destroy context instance
*
* @par Function
* Can only destroy context created through aclrtCreateContext interface
*
* @param context [IN] the context to destroy
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtCreateContext
*/
ACL_FUNC_VISIBILITY aclError aclrtDestroyContext(aclrtContext context);

/**
* @ingroup AscendCL
* @brief set the context of the thread
*
* @par Function
* The following scenarios are supported:
* @li If the aclrtCreateContext interface is called in a thread to explicitly
* create a Context (for example: ctx1), the thread's Context can be specified
* without calling the aclrtSetCurrentContext interface.
 * The system uses ctx1 as the context of that thread by default.
 * @li If no context is explicitly created through the aclrtCreateContext interface,
* the system uses the default context as the context of the thread.
* At this time, the aclrtDestroyContext interface cannot be used to release
* the default context.
* @li If the aclrtSetCurrentContext interface is called multiple times to
* set the thread's Context, the last one prevails.
*
* @par Restriction
 * @li If the device corresponding to the context set for the thread
* has been reset, you cannot set the context as the context of the thread,
* otherwise a business exception will result.
* @li It is recommended to use the context created in a thread.
* If the aclrtCreateContext interface is called in thread A to create a context,
* and the context is used in thread B,
* the user must guarantee the execution order of tasks in the same stream
* under the same context in two threads.
*
* @param context [IN] the current context of the thread
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtCreateContext | aclrtDestroyContext
*/
ACL_FUNC_VISIBILITY aclError aclrtSetCurrentContext(aclrtContext context);

/**
* @ingroup AscendCL
* @brief get the context of the thread
*
* @par Function
* If the user calls the aclrtSetCurrentContext interface
* multiple times to set the context of the current thread,
* then the last set context is obtained
*
* @param context [OUT] the current context of the thread
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtSetCurrentContext
*/
ACL_FUNC_VISIBILITY aclError aclrtGetCurrentContext(aclrtContext *context);
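
/*
 * Usage sketch (illustrative only): creating an explicit context on device 0
 * and binding it to the current thread before issuing work, as recommended
 * above for maintainability.
 */
#if 0
aclrtContext ctx = NULL;
aclrtSetDevice(0);            /* implicitly creates the default context */
aclrtCreateContext(&ctx, 0);  /* explicit context on device 0 */
aclrtSetCurrentContext(ctx);  /* subsequent calls in this thread use ctx */
/* ... streams and events created now belong to ctx ... */
aclrtDestroyContext(ctx);
aclrtResetDevice(0);
#endif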

/**
* @ingroup AscendCL
* @brief Specify the device to use for the operation
* implicitly create the default context and the default stream
*
* @par Function
* The following use cases are supported:
* @li Device can be specified in the process or thread.
* If you call the aclrtSetDevice interface multiple
* times to specify the same device,
* you only need to call the aclrtResetDevice interface to reset the device.
* @li The same device can be specified for operation
* in different processes or threads.
* @li Device is specified in a process,
* and multiple threads in the process can share this device to explicitly
* create a Context (aclrtCreateContext interface).
* @li In multi-device scenarios, you can switch to other devices
* through the aclrtSetDevice interface in the process.
*
* @param deviceId [IN] the device id
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtResetDevice |aclrtCreateContext
*/
ACL_FUNC_VISIBILITY aclError aclrtSetDevice(int32_t deviceId);

/**
* @ingroup AscendCL
* @brief Reset the current operating Device and free resources on the device,
* including the default context, the default stream,
 * and all streams created under the default context.
 * This is a synchronous interface.
* If the task under the default context or stream has not been completed,
* the system will wait for the task to complete before releasing it.
*
* @par Restriction
 * @li For the Context, Stream, and Event explicitly created
 * on the device to be reset, it is recommended to follow
 * the interface calling sequence below before resetting;
 * otherwise business exceptions may be caused.
* @li Interface calling sequence:
* call aclrtDestroyEvent interface to release Event or
* call aclrtDestroyStream interface to release explicitly created Stream->
* call aclrtDestroyContext to release explicitly created Context->
* call aclrtResetDevice interface
*
* @param deviceId [IN] the device id
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtResetDevice(int32_t deviceId);

/**
* @ingroup AscendCL
* @brief get target device of current thread
*
* @param deviceId [OUT] the device id
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtGetDevice(int32_t *deviceId);

/**
* @ingroup AscendCL
 * @brief get the run mode (host side or device side)
*
* @param runMode [OUT] the run mode
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtGetRunMode(aclrtRunMode *runMode);

/**
* @ingroup AscendCL
* @brief Wait for compute device to finish
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtSynchronizeDevice(void);

/**
* @ingroup AscendCL
* @brief Set Scheduling TS
*
* @param tsId [IN] the ts id
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtSetTsDevice(aclrtTsId tsId);

/**
* @ingroup AscendCL
* @brief get total device number.
*
* @param count [OUT] the device number
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtGetDeviceCount(uint32_t *count);

/**
* @ingroup AscendCL
* @brief create event instance
*
* @param event [OUT] created event
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtCreateEvent(aclrtEvent *event);

/**
* @ingroup AscendCL
* @brief create event instance with flag
*
* @param event [OUT] created event
* @param flag [IN] event flag
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag);

/**
* @ingroup AscendCL
* @brief destroy event instance
*
* @par Function
 * Only events created through the aclrtCreateEvent interface can be
 * destroyed; this is a synchronous interface. When destroying an event,
 * the user must ensure that the tasks involved in the aclrtSynchronizeEvent
 * interface or the aclrtStreamWaitEvent interface have completed before
 * the event is destroyed.
*
* @param event [IN] event to destroy
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtCreateEvent | aclrtSynchronizeEvent | aclrtStreamWaitEvent
*/
ACL_FUNC_VISIBILITY aclError aclrtDestroyEvent(aclrtEvent event);

/**
* @ingroup AscendCL
* @brief Record an Event in the Stream
*
* @param event [IN] event to record
* @param stream [IN] stream handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtRecordEvent(aclrtEvent event, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief Reset an event
*
* @par Function
* Users need to make sure to wait for the tasks in the Stream
* to complete before resetting the Event
*
* @param event [IN] event to reset
* @param stream [IN] stream handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtResetEvent(aclrtEvent event, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief Queries an event's status
*
* @param event [IN] event to query
* @param status [OUT] event status
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtQueryEvent(aclrtEvent event, aclrtEventStatus *status);

/**
* @ingroup AscendCL
 * @brief Block the host until the event is complete
*
* @param event [IN] event to wait
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtSynchronizeEvent(aclrtEvent event);

/**
* @ingroup AscendCL
* @brief computes the elapsed time between events.
*
* @param ms [OUT] time between start and end in ms
* @param start [IN] starting event
* @param end [IN] ending event
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtCreateEvent | aclrtRecordEvent | aclrtSynchronizeStream
*/
ACL_FUNC_VISIBILITY aclError aclrtEventElapsedTime(float *ms, aclrtEvent start, aclrtEvent end);
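
/*
 * Timing sketch (illustrative only): bracketing work on a stream with two
 * events and reading the elapsed time after the end event completes.
 */
#if 0
aclrtEvent start = NULL;
aclrtEvent end = NULL;
aclrtCreateEvent(&start);
aclrtCreateEvent(&end);
aclrtRecordEvent(start, stream);
/* ... enqueue the work to be timed on `stream` ... */
aclrtRecordEvent(end, stream);
aclrtSynchronizeEvent(end);
float ms = 0.0F;
aclrtEventElapsedTime(&ms, start, end);
aclrtDestroyEvent(start);
aclrtDestroyEvent(end);
#endif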

/**
* @ingroup AscendCL
* @brief alloc memory on device
*
* @par Function
* alloc for size linear memory on device
* and return a pointer to allocated memory by *devPtr
*
* @par Restriction
* @li The memory requested by the aclrtMalloc interface needs to be released
* through the aclrtFree interface.
* @li Before calling the media data processing interface,
 * if you need to allocate memory on the device to store input or output data,
 * you need to call acldvppMalloc to allocate it.
*
* @param devPtr [OUT] pointer to pointer to allocated memory on device
* @param size [IN] alloc memory size
* @param policy [IN] memory alloc policy
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtFree | acldvppMalloc | aclrtMallocCached
*/
ACL_FUNC_VISIBILITY aclError aclrtMalloc(void **devPtr, size_t size, aclrtMemMallocPolicy policy);

/**
* @ingroup AscendCL
* @brief allocate memory on device with cache
*
* @par Function
* alloc for size linear memory on device
* and return a pointer to allocated memory by *devPtr
*
* @par Restriction
* @li The memory requested by the aclrtMallocCached interface needs to be released
* through the aclrtFree interface.
*
* @param devPtr [OUT] pointer to pointer to allocated memory on device
* @param size [IN] alloc memory size
* @param policy [IN] memory alloc policy
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtFree | aclrtMalloc
*/
ACL_FUNC_VISIBILITY aclError aclrtMallocCached(void **devPtr, size_t size, aclrtMemMallocPolicy policy);

/**
* @ingroup AscendCL
* @brief flush cache data to ddr
*
 * @param devPtr [IN] pointer to the data to be flushed to ddr
* @param size [IN] flush size
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtMemFlush(void *devPtr, size_t size);

/**
* @ingroup AscendCL
* @brief invalidate cache data
*
* @param devPtr [IN] pointer to invalidate cache data
* @param size [IN] invalidate size
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtMemInvalidate(void *devPtr, size_t size);

/**
* @ingroup AscendCL
* @brief free device memory
*
* @par Function
* can only free memory allocated through the aclrtMalloc interface
*
* @param devPtr [IN] Pointer to memory to be freed
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtMalloc
*/
ACL_FUNC_VISIBILITY aclError aclrtFree(void *devPtr);

/**
* @ingroup AscendCL
* @brief alloc memory on host
*
* @par Restriction
 * @li The requested memory cannot be used on the Device
* and needs to be explicitly copied to the Device.
* @li The memory requested by the aclrtMallocHost interface
* needs to be released through the aclrtFreeHost interface.
*
* @param hostPtr [OUT] pointer to pointer to allocated memory on the host
* @param size [IN] alloc memory size
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtFreeHost
*/
ACL_FUNC_VISIBILITY aclError aclrtMallocHost(void **hostPtr, size_t size);

/**
* @ingroup AscendCL
* @brief free host memory
*
* @par Function
* can only free memory allocated through the aclrtMallocHost interface
*
* @param hostPtr [IN] free memory pointer
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtMallocHost
*/
ACL_FUNC_VISIBILITY aclError aclrtFreeHost(void *hostPtr);

/**
* @ingroup AscendCL
* @brief synchronous memory replication between host and device
*
* @param dst [IN] destination address pointer
* @param destMax [IN] Max length of the destination address memory
* @param src [IN] source address pointer
* @param count [IN] the length of byte to copy
* @param kind [IN] memcpy type
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtMemcpy(void *dst, size_t destMax, const void *src, size_t count,
aclrtMemcpyKind kind);
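
/*
 * Copy sketch (illustrative only): device allocation plus synchronous
 * host-to-device and device-to-host copies. hostIn/hostOut are assumed host
 * buffers of `size` bytes.
 */
#if 0
void *dev = NULL;
aclrtMalloc(&dev, size, ACL_MEM_MALLOC_HUGE_FIRST);
aclrtMemcpy(dev, size, hostIn, size, ACL_MEMCPY_HOST_TO_DEVICE);
/* ... run work that reads/writes `dev` ... */
aclrtMemcpy(hostOut, size, dev, size, ACL_MEMCPY_DEVICE_TO_HOST);
aclrtFree(dev);
#endif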

/**
* @ingroup AscendCL
* @brief Initialize memory and set contents of memory to specified value
*
* @par Function
* The memory to be initialized is on the Host or device side,
* and the system determines whether
* it is host or device according to the address
*
* @param devPtr [IN] Starting address of memory
* @param maxCount [IN] Max length of destination address memory
* @param value [IN] Set value
* @param count [IN] The length of memory
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtMemset(void *devPtr, size_t maxCount, int32_t value, size_t count);

/**
* @ingroup AscendCL
* @brief Asynchronous memory replication between Host and Device
*
* @par Function
* After calling this interface,
* be sure to call the aclrtSynchronizeStream interface to ensure that
* the task of memory replication has been completed
*
* @par Restriction
* @li For on-chip Device-to-Device memory copy,
* both the source and destination addresses must be 64-byte aligned
*
* @param dst [IN] destination address pointer
* @param destMax [IN] Max length of destination address memory
* @param src [IN] source address pointer
* @param count [IN] the number of byte to copy
* @param kind [IN] memcpy type
 * @param stream [IN] asynchronous task stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtSynchronizeStream
*/
ACL_FUNC_VISIBILITY aclError aclrtMemcpyAsync(void *dst, size_t destMax, const void *src, size_t count,
aclrtMemcpyKind kind, aclrtStream stream);
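
/*
 * Async sketch (illustrative only): an asynchronous copy must be followed by a
 * stream synchronization before the destination can be safely read; the source
 * buffer must stay valid until then.
 */
#if 0
aclrtMemcpyAsync(dev, size, hostIn, size, ACL_MEMCPY_HOST_TO_DEVICE, stream);
/* ... optionally enqueue more tasks on `stream` ... */
aclrtSynchronizeStream(stream);
#endif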

/**
* @ingroup AscendCL
* @brief Asynchronous initialize memory
* and set contents of memory to specified value async
*
* @par Function
* The memory to be initialized is on the Host or device side,
* and the system determines whether
* it is host or device according to the address
*
* @param devPtr [IN] destination address pointer
* @param maxCount [IN] Max length of destination address memory
* @param value [IN] set value
* @param count [IN] the number of byte to set
 * @param stream [IN] asynchronous task stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtSynchronizeStream
*/
ACL_FUNC_VISIBILITY aclError aclrtMemsetAsync(void *devPtr, size_t maxCount, int32_t value, size_t count,
aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create stream instance
*
* @param stream [OUT] the created stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtCreateStream(aclrtStream *stream);

/**
* @ingroup AscendCL
* @brief destroy stream instance
*
* @par Function
* Can only destroy streams created through the aclrtCreateStream interface
*
* @par Restriction
* Before calling the aclrtDestroyStream interface to destroy
* the specified Stream, you need to call the aclrtSynchronizeStream interface
* to ensure that the tasks in the Stream have been completed.
*
* @param stream [IN] the stream to destroy
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtCreateStream | aclrtSynchronizeStream
*/
ACL_FUNC_VISIBILITY aclError aclrtDestroyStream(aclrtStream stream);

/**
* @ingroup AscendCL
* @brief block the host until all tasks
* in the specified stream have completed
*
* @param stream [IN] the stream to wait
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtSynchronizeStream(aclrtStream stream);

/**
* @ingroup AscendCL
* @brief Blocks the operation of the specified Stream until
* the specified Event is completed.
 * Supports multiple streams waiting for the same event.
 *
 * @param stream [IN] the waiting stream. If using the default Stream, set NULL
* @param event [IN] the event to wait
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtStreamWaitEvent(aclrtStream stream, aclrtEvent event);
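
/*
 * Dependency sketch (illustrative only): making stream s2 wait for work
 * recorded on stream s1 without blocking the host.
 */
#if 0
aclrtEvent done = NULL;
aclrtCreateEvent(&done);
/* ... enqueue producer tasks on s1 ... */
aclrtRecordEvent(done, s1);
aclrtStreamWaitEvent(s2, done);  /* tasks enqueued on s2 afterwards run once `done` completes */
/* ... enqueue consumer tasks on s2 ... */
#endif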

/**
* @ingroup AscendCL
* @brief set group
*
* @par Function
 * assign tasks to the corresponding group
*
* @param groupId [IN] group id
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtGetGroupCount | aclrtGetAllGroupInfo | aclrtGetGroupInfoDetail
*/
ACL_FUNC_VISIBILITY aclError aclrtSetGroup(int32_t groupId);

/**
* @ingroup AscendCL
* @brief get the number of group
*
* @par Function
 * get the number of groups. If the number of groups is zero,
 * it means that groups are not supported or no group has been created.
*
* @param count [OUT] the number of group
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
*/
ACL_FUNC_VISIBILITY aclError aclrtGetGroupCount(uint32_t *count);

/**
* @ingroup AscendCL
* @brief create group information
*
* @retval null for failed.
* @retval OtherValues success.
*
* @see aclrtDestroyGroupInfo
*/
ACL_FUNC_VISIBILITY aclrtGroupInfo *aclrtCreateGroupInfo();

/**
* @ingroup AscendCL
* @brief destroy group information
*
* @param groupInfo [IN] pointer to group information
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtCreateGroupInfo
*/
ACL_FUNC_VISIBILITY aclError aclrtDestroyGroupInfo(aclrtGroupInfo *groupInfo);

/**
* @ingroup AscendCL
* @brief get all group information
*
* @param groupInfo [OUT] pointer to group information
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtGetGroupCount
*/
ACL_FUNC_VISIBILITY aclError aclrtGetAllGroupInfo(aclrtGroupInfo *groupInfo);

/**
* @ingroup AscendCL
* @brief get detail information of group
*
* @param groupInfo [IN] pointer to group information
* @param groupIndex [IN] group index value
* @param attr [IN] group attribute
* @param attrValue [OUT] pointer to attribute value
* @param valueLen [IN] length of attribute value
* @param paramRetSize [OUT] pointer to real length of attribute value
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtGetGroupCount | aclrtGetAllGroupInfo
*/
ACL_FUNC_VISIBILITY aclError aclrtGetGroupInfoDetail(const aclrtGroupInfo *groupInfo, int32_t groupIndex,
aclrtGroupAttr attr, void *attrValue, size_t valueLen,
size_t *paramRetSize);
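
/*
 * Query sketch (illustrative only): enumerating groups and reading one integer
 * attribute per group through the generic attrValue/valueLen interface. The
 * int32_t value type for ACL_GROUP_AICORE_INT is an assumption implied by the
 * attribute name.
 */
#if 0
uint32_t groupCount = 0;
aclrtGetGroupCount(&groupCount);
aclrtGroupInfo *info = aclrtCreateGroupInfo();
aclrtGetAllGroupInfo(info);
for (int32_t i = 0; i < (int32_t)groupCount; ++i) {
  int32_t aicoreNum = 0;
  size_t retSize = 0;
  aclrtGetGroupInfoDetail(info, i, ACL_GROUP_AICORE_INT, &aicoreNum, sizeof(aicoreNum), &retSize);
}
aclrtDestroyGroupInfo(info);
#endif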

/**
* @ingroup AscendCL
 * @brief check whether the current device and the peer device support the p2p feature
*
* @param canAccessPeer [OUT] pointer to save the checking result
* @param deviceId [IN] current device id
* @param peerDeviceId [IN] peer device id
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtDeviceEnablePeerAccess | aclrtDeviceDisablePeerAccess
*/
ACL_FUNC_VISIBILITY aclError aclrtDeviceCanAccessPeer(int32_t *canAccessPeer, int32_t deviceId, int32_t peerDeviceId);

/**
* @ingroup AscendCL
* @brief enable the peer device to support the p2p feature
*
* @param peerDeviceId [IN] the peer device id
* @param flags [IN] reserved field, now it must be zero
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtDeviceCanAccessPeer | aclrtDeviceDisablePeerAccess
*/
ACL_FUNC_VISIBILITY aclError aclrtDeviceEnablePeerAccess(int32_t peerDeviceId, uint32_t flags);

/**
* @ingroup AscendCL
* @brief disable the peer device to support the p2p function
*
* @param peerDeviceId [IN] the peer device id
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclrtDeviceCanAccessPeer | aclrtDeviceEnablePeerAccess
*/
ACL_FUNC_VISIBILITY aclError aclrtDeviceDisablePeerAccess(int32_t peerDeviceId);

/**
* @ingroup AscendCL
 * @brief Obtain the free memory and total memory of the specified attribute.
 * The specified memory includes normal memory and huge memory.
*
* @param attr [IN] the memory attribute of specified device
* @param free [OUT] the free memory of specified device
* @param total [OUT] the total memory of specified device.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtGetMemInfo(aclrtMemAttr attr, size_t *free, size_t *total);

/**
* @ingroup AscendCL
 * @brief Set the timeout interval for op waiting
*
* @param timeout [IN] op wait timeout
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclrtSetOpWaitTimeout(uint32_t timeout);

#ifdef __cplusplus
}
#endif

#endif // INC_EXTERNAL_ACL_ACL_RT_H_

+ 276  - 0   inc/external/acl/acl_tdt.h

@@ -0,0 +1,276 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_ACL_ACL_TDT_H_
#define INC_EXTERNAL_ACL_ACL_TDT_H_

#include "acl/acl_base.h"

#ifdef __cplusplus
extern "C" {
#endif

enum acltdtTensorType {
ACL_TENSOR_DATA_UNDEFINED = -1,
ACL_TENSOR_DATA_TENSOR,
ACL_TENSOR_DATA_END_OF_SEQUENCE,
ACL_TENSOR_DATA_ABNORMAL
};

typedef struct acltdtDataItem acltdtDataItem;
typedef struct acltdtDataset acltdtDataset;
typedef struct acltdtChannelHandle acltdtChannelHandle;

/**
* @ingroup AscendCL
* @brief Get tensor type from item
*
* @param dataItem [IN] pointer to the data item
*
* @retval Tensor type.
 * @retval ACL_TENSOR_DATA_UNDEFINED if dataItem is null
*/
ACL_FUNC_VISIBILITY acltdtTensorType acltdtGetTensorTypeFromItem(const acltdtDataItem *dataItem);

/**
* @ingroup AscendCL
* @brief Get data type from item
*
* @param dataItem [IN] pointer to the data item
*
* @retval Data type.
* @retval ACL_DT_UNDEFINED if dataItem is null
*/
ACL_FUNC_VISIBILITY aclDataType acltdtGetDataTypeFromItem(const acltdtDataItem *dataItem);

/**
* @ingroup AscendCL
* @brief Get data address from item
*
* @param dataItem [IN] pointer to data item
*
* @retval null for failed
* @retval OtherValues success
*/
ACL_FUNC_VISIBILITY void *acltdtGetDataAddrFromItem(const acltdtDataItem *dataItem);

/**
* @ingroup AscendCL
* @brief Get data size from item
*
* @param dataItem [IN] pointer to data item
*
* @retval 0 for failed
* @retval OtherValues success
*/
ACL_FUNC_VISIBILITY size_t acltdtGetDataSizeFromItem(const acltdtDataItem *dataItem);

/**
* @ingroup AscendCL
 * @brief Get the number of dims from item
*
* @param dataItem [IN] pointer to data item
*
* @retval 0 for failed
* @retval OtherValues success
*/
ACL_FUNC_VISIBILITY size_t acltdtGetDimNumFromItem(const acltdtDataItem *dataItem);

/**
* @ingroup AscendCL
* @brief Get dims from item
*
* @param dataItem [IN] the struct of data item
 * @param dims [IN|OUT] pointer to the dims of dataItem
* @param dimNum [IN] the size of the dims
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError acltdtGetDimsFromItem(const acltdtDataItem *dataItem, int64_t *dims, size_t dimNum);

/**
* @ingroup AscendCL
* @brief Create the struct of data item
*
* @param tdtType [IN] Tdt tensor type
* @param dims [IN] pointer of tdtDataItem's dims
* @param dimNum [IN] Dim number
* @param dataType [IN] Data type
* @param data [IN] Data pointer
* @param size [IN] Data size
*
* @retval null for failed
* @retval OtherValues success
*
* @see acltdtDestroyDataItem
*/
ACL_FUNC_VISIBILITY acltdtDataItem *acltdtCreateDataItem(acltdtTensorType tdtType, const int64_t *dims, size_t dimNum,
aclDataType dataType, void *data, size_t size);

/**
* @ingroup AscendCL
* @brief Destroy the struct of data item
*
* @param dataItem [IN] pointer to the data item
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see acltdtCreateDataItem
*/
ACL_FUNC_VISIBILITY aclError acltdtDestroyDataItem(acltdtDataItem *dataItem);

/**
* @ingroup AscendCL
* @brief Create the tdt dataset
*
* @retval null for failed
* @retval OtherValues success
*
* @see acltdtDestroyDataset
*/
ACL_FUNC_VISIBILITY acltdtDataset *acltdtCreateDataset();

/**
* @ingroup AscendCL
* @brief Destroy the tdt dataset
*
* @param dataset [IN] pointer to the dataset
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see acltdtCreateDataset
*/
ACL_FUNC_VISIBILITY aclError acltdtDestroyDataset(acltdtDataset *dataset);

/**
* @ingroup AscendCL
* @brief Get the data item
*
* @param dataset [IN] pointer to the dataset
* @param index [IN] index of the dataset
*
* @retval null for failed
* @retval OtherValues success
*
* @see acltdtAddDataItem
*/
ACL_FUNC_VISIBILITY acltdtDataItem *acltdtGetDataItem(const acltdtDataset *dataset, size_t index);

/**
* @ingroup AscendCL
 * @brief Add the data item to the dataset
 *
 * @param dataset [IN|OUT] pointer to the dataset
* @param dataItem [IN] pointer to the data item
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see acltdtGetDataItem
*/
ACL_FUNC_VISIBILITY aclError acltdtAddDataItem(acltdtDataset *dataset, acltdtDataItem *dataItem);

/**
* @ingroup AscendCL
* @brief Get the size of dataset
*
* @param dataset [IN] pointer to the dataset
*
* @retval 0 for failed
* @retval OtherValues success
*/
ACL_FUNC_VISIBILITY size_t acltdtGetDatasetSize(const acltdtDataset *dataset);

/**
* @ingroup AscendCL
* @brief Stop the channel
*
* @param handle [IN] pointer to the channel handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see acltdtCreateChannel | acltdtDestroyChannel
*/
ACL_FUNC_VISIBILITY aclError acltdtStopChannel(acltdtChannelHandle *handle);

/**
* @ingroup AscendCL
* @brief Create the channel
*
* @param deviceId [IN] the device id
* @param name [IN] the channel's name
*
* @retval null for failed
* @retval OtherValues success
*
* @see acltdtStopChannel | acltdtDestroyChannel
*/
ACL_FUNC_VISIBILITY acltdtChannelHandle *acltdtCreateChannel(uint32_t deviceId, const char *name);

/**
* @ingroup AscendCL
* @brief Destroy the channel
*
* @param handle [IN] pointer to the channel handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see acltdtCreateChannel | acltdtStopChannel
*/
ACL_FUNC_VISIBILITY aclError acltdtDestroyChannel(acltdtChannelHandle *handle);

/**
* @ingroup AscendCL
* @brief Send tensor to device
*
* @param handle [IN] pointer to the channel handle
* @param dataset [IN] pointer to the dataset
* @param timeout [IN] to be reserved, now it must be -1
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see acltdtReceiveTensor
*/
ACL_FUNC_VISIBILITY aclError acltdtSendTensor(const acltdtChannelHandle *handle, const acltdtDataset *dataset,
int32_t timeout);

/**
* @ingroup AscendCL
* @brief Receive tensor from device
*
* @param handle [IN] pointer to the channel handle
* @param dataset [OUT] pointer to the dataset
* @param timeout [IN] to be reserved, now it must be -1
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see acltdtSendTensor
*/
ACL_FUNC_VISIBILITY aclError acltdtReceiveTensor(const acltdtChannelHandle *handle, acltdtDataset *dataset,
int32_t timeout);
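
/*
 * Send sketch (illustrative only): pushing one host tensor through a tdt
 * channel. The channel name "train_queue", the 2x3 float shape and hostData
 * are placeholders; timeout must currently be -1 per the notes above, and
 * ACL_FLOAT is assumed from acl_base.h.
 */
#if 0
acltdtChannelHandle *channel = acltdtCreateChannel(0, "train_queue");
int64_t dims[] = {2, 3};
acltdtDataItem *item = acltdtCreateDataItem(ACL_TENSOR_DATA_TENSOR, dims, 2, ACL_FLOAT,
                                            hostData, 6 * sizeof(float));
acltdtDataset *dataset = acltdtCreateDataset();
acltdtAddDataItem(dataset, item);
acltdtSendTensor(channel, dataset, -1);
acltdtDestroyDataset(dataset);
acltdtDestroyDataItem(item);
acltdtDestroyChannel(channel);
#endif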

#ifdef __cplusplus
}
#endif

#endif // INC_EXTERNAL_ACL_ACL_TDT_H_

+ 75  - 0   inc/external/acl/error_codes/ge_error_codes.h

@@ -0,0 +1,75 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_GE_GE_ERROR_CODES_H_
#define INC_EXTERNAL_GE_GE_ERROR_CODES_H_

#if defined(_MSC_VER)
#ifdef FUNC_VISIBILITY
#define GE_FUNC_VISIBILITY _declspec(dllexport)
#else
#define GE_FUNC_VISIBILITY
#endif
#else
#ifdef FUNC_VISIBILITY
#define GE_FUNC_VISIBILITY __attribute__((visibility("default")))
#else
#define GE_FUNC_VISIBILITY
#endif
#endif

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif
static const uint32_t ACL_ERROR_GE_PARAM_INVALID = 145000;
static const uint32_t ACL_ERROR_GE_EXEC_NOT_INIT = 145001;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID = 145002;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ID_INVALID = 145003;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID = 145006;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ADDR_INVALID = 145007;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID = 145008;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_REPEATED = 145009;
static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_ADDR_INVALID = 145011;
static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_LENGTH_INVALID = 145012;
static const uint32_t ACL_ERROR_GE_DYNAMIC_BATCH_SIZE_INVALID = 145013;
static const uint32_t ACL_ERROR_GE_AIPP_BATCH_EMPTY = 145014;
static const uint32_t ACL_ERROR_GE_AIPP_NOT_EXIST = 145015;
static const uint32_t ACL_ERROR_GE_AIPP_MODE_INVALID = 145016;
static const uint32_t ACL_ERROR_GE_OP_TASK_TYPE_INVALID = 145017;
static const uint32_t ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID = 145018;
static const uint32_t ACL_ERROR_GE_PLGMGR_PATH_INVALID = 145019;
static const uint32_t ACL_ERROR_GE_FORMAT_INVALID = 145020;
static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021;
static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022;
static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000;
static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001;
static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000;
static const uint32_t ACL_ERROR_GE_LOAD_MODEL = 545001;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_PARTITION_FAILED = 545002;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED = 545003;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED = 545004;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_KERNEL_PARTITION_FAILED = 545005;
static const uint32_t ACL_ERROR_GE_EXEC_RELEASE_MODEL_DATA = 545006;
static const uint32_t ACL_ERROR_GE_COMMAND_HANDLE = 545007;
static const uint32_t ACL_ERROR_GE_GET_TENSOR_INFO = 545008;
static const uint32_t ACL_ERROR_GE_UNLOAD_MODEL = 545009;

#ifdef __cplusplus
} // extern "C"
#endif
#endif // INC_EXTERNAL_GE_GE_ERROR_CODES_H_
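
The numeric bands above are meaningful: 145xxx codes report invalid caller input, 245xxx report resource failures, and 545xxx report internal GE failures. A small triage sketch derived only from those constants; the helper name is illustrative and not part of any ACL header:

// Hedged sketch: classify a GE error code by its numeric band.
static inline const char *GeErrorBand(uint32_t code) {
  if (code >= 145000 && code < 146000) return "invalid parameter or request";
  if (code >= 245000 && code < 246000) return "resource allocation failure";
  if (code >= 545000 && code < 546000) return "internal GE failure";
  return "unknown";
}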

+ 109
- 0
inc/external/acl/error_codes/rt_error_codes.h View File

@@ -0,0 +1,109 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef __INC_EXTERNAL_RT_ERROR_CODES_H__
#define __INC_EXTERNAL_RT_ERROR_CODES_H__

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

static const int32_t ACL_RT_SUCCESS = 0; // success

static const int32_t ACL_ERROR_RT_PARAM_INVALID = 107000; // param invalid
static const int32_t ACL_ERROR_RT_INVALID_DEVICEID = 107001; // invalid device id
static const int32_t ACL_ERROR_RT_CONTEXT_NULL = 107002; // current context null
static const int32_t ACL_ERROR_RT_STREAM_CONTEXT = 107003; // stream not in current context
static const int32_t ACL_ERROR_RT_MODEL_CONTEXT = 107004; // model not in current context
static const int32_t ACL_ERROR_RT_STREAM_MODEL = 107005; // stream not in model
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID = 107006; // event timestamp invalid
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL = 107007; // event timestamp reversal
static const int32_t ACL_ERROR_RT_ADDR_UNALIGNED = 107008; // memory address unaligned
static const int32_t ACL_ERROR_RT_FILE_OPEN = 107009; // open file failed
static const int32_t ACL_ERROR_RT_FILE_WRITE = 107010; // write file failed
static const int32_t ACL_ERROR_RT_STREAM_SUBSCRIBE = 107011; // error subscribe stream
static const int32_t ACL_ERROR_RT_THREAD_SUBSCRIBE = 107012; // error subscribe thread
static const int32_t ACL_ERROR_RT_GROUP_NOT_SET = 107013; // group not set
static const int32_t ACL_ERROR_RT_GROUP_NOT_CREATE = 107014; // group not create
static const int32_t ACL_ERROR_RT_STREAM_NO_CB_REG = 107015; // callback not register to stream
static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid memory type
static const int32_t ACL_ERROR_RT_INVALID_HANDLE = 107017; // invalid handle
static const int32_t ACL_ERROR_RT_INVALID_MALLOC_TYPE = 107018; // invalid malloc type
static const int32_t ACL_ERROR_RT_WAIT_TIMEOUT = 107019; // wait timeout

static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPORT = 207000; // feature not support
static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error
static const int32_t ACL_ERROR_RT_MEMORY_FREE = 207002; // memory free error
static const int32_t ACL_ERROR_RT_AICORE_OVER_FLOW = 207003; // aicore over flow
static const int32_t ACL_ERROR_RT_NO_DEVICE = 207004; // no device
static const int32_t ACL_ERROR_RT_RESOURCE_ALLOC_FAIL = 207005; // resource alloc fail
static const int32_t ACL_ERROR_RT_NO_PERMISSION = 207006; // no permission
static const int32_t ACL_ERROR_RT_NO_EVENT_RESOURCE = 207007; // no event resource
static const int32_t ACL_ERROR_RT_NO_STREAM_RESOURCE = 207008; // no stream resource
static const int32_t ACL_ERROR_RT_NO_NOTIFY_RESOURCE = 207009; // no notify resource
static const int32_t ACL_ERROR_RT_NO_MODEL_RESOURCE = 207010; // no model resource
static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE = 207011; // no cdq resource

static const int32_t ACL_ERROR_RT_INTERNAL_ERROR = 507000; // runtime internal error
static const int32_t ACL_ERROR_RT_TS_ERROR = 507001; // ts internal error
static const int32_t ACL_ERROR_RT_STREAM_TASK_FULL = 507002; // task full in stream
static const int32_t ACL_ERROR_RT_STREAM_TASK_EMPTY = 507003; // task empty in stream
static const int32_t ACL_ERROR_RT_STREAM_NOT_COMPLETE = 507004; // stream not complete
static const int32_t ACL_ERROR_RT_END_OF_SEQUENCE = 507005; // end of sequence
static const int32_t ACL_ERROR_RT_EVENT_NOT_COMPLETE = 507006; // event not complete
static const int32_t ACL_ERROR_RT_CONTEXT_RELEASE_ERROR = 507007; // context release error
static const int32_t ACL_ERROR_RT_SOC_VERSION = 507008; // soc version error
static const int32_t ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT = 507009; // task type not support
static const int32_t ACL_ERROR_RT_LOST_HEARTBEAT = 507010; // ts lost heartbeat
static const int32_t ACL_ERROR_RT_MODEL_EXECUTE = 507011; // model execute failed
static const int32_t ACL_ERROR_RT_REPORT_TIMEOUT = 507012; // report timeout
static const int32_t ACL_ERROR_RT_SYS_DMA = 507013; // sys dma error
static const int32_t ACL_ERROR_RT_AICORE_TIMEOUT = 507014; // aicore timeout
static const int32_t ACL_ERROR_RT_AICORE_EXCEPTION = 507015; // aicore exception
static const int32_t ACL_ERROR_RT_AICORE_TRAP_EXCEPTION = 507016; // aicore trap exception
static const int32_t ACL_ERROR_RT_AICPU_TIMEOUT = 507017; // aicpu timeout
static const int32_t ACL_ERROR_RT_AICPU_EXCEPTION = 507018; // aicpu exception
static const int32_t ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR = 507019; // aicpu datadump response error
static const int32_t ACL_ERROR_RT_AICPU_MODEL_RSP_ERR = 507020; // aicpu model operate response error
static const int32_t ACL_ERROR_RT_PROFILING_ERROR = 507021; // profiling error
static const int32_t ACL_ERROR_RT_IPC_ERROR = 507022; // ipc error
static const int32_t ACL_ERROR_RT_MODEL_ABORT_NORMAL = 507023; // model abort normal
static const int32_t ACL_ERROR_RT_KERNEL_UNREGISTERING = 507024; // kernel unregistering
static const int32_t ACL_ERROR_RT_RINGBUFFER_NOT_INIT = 507025; // ringbuffer not init
static const int32_t ACL_ERROR_RT_RINGBUFFER_NO_DATA = 507026; // ringbuffer no data
static const int32_t ACL_ERROR_RT_KERNEL_LOOKUP = 507027; // kernel lookup error
static const int32_t ACL_ERROR_RT_KERNEL_DUPLICATE = 507028; // kernel register duplicate
static const int32_t ACL_ERROR_RT_DEBUG_REGISTER_FAIL = 507029; // debug register failed
static const int32_t ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL = 507030; // debug unregister failed
static const int32_t ACL_ERROR_RT_LABEL_CONTEXT = 507031; // label not in current context
static const int32_t ACL_ERROR_RT_PROGRAM_USE_OUT = 507032; // program register num use out
static const int32_t ACL_ERROR_RT_DEV_SETUP_ERROR = 507033; // device setup error
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TIMEOUT = 507034; // vector core timeout
static const int32_t ACL_ERROR_RT_VECTOR_CORE_EXCEPTION = 507035; // vector core exception
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_EXCEPTION = 507036; // vector core trap exception
static const int32_t ACL_ERROR_RT_CDQ_BATCH_ABNORMAL = 507037; // cdq alloc batch abnormal

static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error
static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error
static const int32_t ACL_ERROR_RT_SOCKET_CLOSE = 507901; // hdc disconnect

#ifdef __cplusplus
}
#endif

#endif // __INC_EXTERNAL_RT_ERROR_CODES_H__

+ 334
- 0
inc/external/acl/ops/acl_cblas.h View File

@@ -0,0 +1,334 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_ACL_OPS_ACL_CBLAS_H_
#define INC_EXTERNAL_ACL_OPS_ACL_CBLAS_H_

#include "acl/acl.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef enum aclTransType { ACL_TRANS_N, ACL_TRANS_T, ACL_TRANS_NZ, ACL_TRANS_NZ_T } aclTransType;

typedef enum aclComputeType { ACL_COMPUTE_HIGH_PRECISION, ACL_COMPUTE_LOW_PRECISION } aclComputeType;

/**
* @ingroup AscendCL
* @brief perform the matrix-vector multiplication
*
* @param transA [IN] transpose type of matrix A
* @param m [IN] number of rows of matrix A
* @param n [IN] number of columns of matrix A
* @param alpha [IN] pointer to scalar used for multiplication,
* of the same type as dataTypeY
* @param a [IN] pointer to matrix A
* @param lda [IN] leading dimension used to store the matrix A
* @param dataTypeA [IN] datatype of matrix A
* @param x [IN] pointer to vector x
* @param incx [IN] stride between consecutive elements of vector x
* @param dataTypeX [IN] datatype of vector x
* @param beta [IN] pointer to scalar used for multiplication,
* of the same type as dataTypeY. If beta == 0,
* then y does not have to be a valid input
* @param y [IN|OUT] pointer to vector y
* @param incy [IN] stride between consecutive elements of vector y
* @param dataTypeY [IN] datatype of vector y
* @param type [IN] computation type
* @param stream [IN] stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasGemvEx(aclTransType transA, int m, int n, const void *alpha, const void *a, int lda,
aclDataType dataTypeA, const void *x, int incx, aclDataType dataTypeX,
const void *beta, void *y, int incy, aclDataType dataTypeY,
aclComputeType type, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create a handle for performing the matrix-vector multiplication
*
* @param transA [IN] transpose type of matrix A
* @param m [IN] number of rows of matrix A
* @param n [IN] number of columns of matrix A
* @param dataTypeA [IN] datatype of matrix A
* @param dataTypeX [IN] datatype of vector x
* @param dataTypeY [IN] datatype of vector y
* @param type [IN] computation type
* @param handle [OUT] pointer to the pointer to the handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasCreateHandleForGemvEx(aclTransType transA, int m, int n, aclDataType dataTypeA,
aclDataType dataTypeX, aclDataType dataTypeY,
aclComputeType type, aclopHandle **handle);

/**
* @ingroup AscendCL
* @brief perform the matrix-vector multiplication
*
* @param transA [IN] transpose type of matrix A
* @param m [IN] number of rows of matrix A
* @param n [IN] number of columns of matrix A
* @param alpha [IN] pointer to scalar used for multiplication
* @param a [IN] pointer to matrix A
* @param lda [IN] leading dimension used to store the matrix A
* @param x [IN] pointer to vector x
* @param incx [IN] stride between consecutive elements of vector x
* @param beta [IN] pointer to scalar used for multiplication.
* If beta value == 0,
* then y does not have to be a valid input
* @param y [IN|OUT] pointer to vector y
* @param incy [IN] stride between consecutive elements of vector y
* @param type [IN] computation type
* @param stream [IN] stream
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasHgemv(aclTransType transA, int m, int n, const aclFloat16 *alpha,
const aclFloat16 *a, int lda, const aclFloat16 *x, int incx,
const aclFloat16 *beta, aclFloat16 *y, int incy, aclComputeType type,
aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create a handle for performing the matrix-vector multiplication
*
* @param transA [IN] transpose type of matrix A
* @param m [IN] number of rows of matrix A
* @param n [IN] number of columns of matrix A
* @param type [IN] computation type
* @param handle [OUT] pointer to the pointer to the handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasCreateHandleForHgemv(aclTransType transA, int m, int n, aclComputeType type,
aclopHandle **handle);

/**
* @ingroup AscendCL
* @brief perform the matrix-vector multiplication
*
* @param transA [IN] transpose type of matrix A
* @param m [IN] number of rows of matrix A
* @param n [IN] number of columns of matrix A
* @param alpha [IN] pointer to scalar used for multiplication
* @param a [IN] pointer to matrix A
* @param lda [IN] leading dimension used to store the matrix A
* @param x [IN] pointer to vector x
* @param incx [IN] stride between consecutive elements of vector x
* @param beta [IN] pointer to scalar used for multiplication.
* If beta value == 0,
* then y does not have to be a valid input
* @param y [IN|OUT] pointer to vector y
* @param incy [IN] stride between consecutive elements of vector y
* @param type [IN] computation type
* @param stream [IN] stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasS8gemv(aclTransType transA, int m, int n, const int32_t *alpha, const int8_t *a,
int lda, const int8_t *x, int incx, const int32_t *beta, int32_t *y,
int incy, aclComputeType type, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create a handle for performing the matrix-vector multiplication
*
* @param transA [IN] transpose type of matrix A
* @param m [IN] number of rows of matrix A
* @param n [IN] number of columns of matrix A
* @param type [IN] computation type
* @param handle [OUT] pointer to the pointer to the handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasCreateHandleForS8gemv(aclTransType transA, int m, int n, aclComputeType type,
aclopHandle **handle);

/**
* @ingroup AscendCL
* @brief perform the matrix-matrix multiplication
*
* @param transA [IN] transpose type of matrix A
* @param transB [IN] transpose type of matrix B
* @param transC [IN] transpose type of matrix C
* @param m [IN] number of rows of matrix A and matrix C
* @param n [IN] number of columns of matrix B and matrix C
* @param k [IN] number of columns of matrix A and rows of matrix B
* @param alpha [IN] pointer to scalar used for multiplication, of the same type as dataTypeC
* @param matrixA [IN] pointer to matrix A
* @param lda [IN] leading dimension used to store matrix A
* @param dataTypeA [IN] datatype of matrix A
* @param matrixB [IN] pointer to matrix B
* @param ldb [IN] leading dimension used to store matrix B
* @param dataTypeB [IN] datatype of matrix B
* @param beta [IN] pointer to scalar used for multiplication,
* of the same type as dataTypeC. If beta == 0,
* then matrixC does not have to be a valid input
* @param matrixC [IN|OUT] pointer to matrix C
* @param ldc [IN] leading dimension used to store matrix C
* @param dataTypeC [IN] datatype of matrix C
* @param type [IN] computation type
* @param stream [IN] stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasGemmEx(aclTransType transA, aclTransType transB, aclTransType transC, int m, int n,
int k, const void *alpha, const void *matrixA, int lda,
aclDataType dataTypeA, const void *matrixB, int ldb, aclDataType dataTypeB,
const void *beta, void *matrixC, int ldc, aclDataType dataTypeC,
aclComputeType type, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create a handle for performing the matrix-matrix multiplication
*
* @param transA [IN] transpose type of matrix A
* @param transB [IN] transpose type of matrix B
* @param transC [IN] transpose type of matrix C
* @param m [IN] number of rows of matrix A and matrix C
* @param n [IN] number of columns of matrix B and matrix C
* @param k [IN] number of columns of matrix A and rows of matrix B
* @param dataTypeA [IN] datatype of matrix A
* @param dataTypeB [IN] datatype of matrix B
* @param dataTypeC [IN] datatype of matrix C
* @param type [IN] computation type
* @param handle [OUT] pointer to the pointer to the handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasCreateHandleForGemmEx(aclTransType transA, aclTransType transB, aclTransType transC,
int m, int n, int k, aclDataType dataTypeA,
aclDataType dataTypeB, aclDataType dataTypeC,
aclComputeType type, aclopHandle **handle);

/**
* @ingroup AscendCL
* @brief perform the matrix-matrix multiplication
*
* @param transA [IN] transpose type of matrix A
* @param transB [IN] transpose type of matrix B
* @param transC [IN] transpose type of matrix C
* @param m [IN] number of rows of matrix A and matrix C
* @param n [IN] number of columns of matrix B and matrix C
* @param k [IN] number of columns of matrix A and rows of matrix B
* @param alpha [IN] pointer to scalar used for multiplication
* @param matrixA [IN] pointer to matrix A
* @param lda [IN] leading dimension used to store the matrix A
* @param matrixB [IN] pointer to matrix B
* @param ldb [IN] leading dimension used to store the matrix B
* @param beta [IN] pointer to scalar used for multiplication.
* If beta value == 0,
* then matrixC does not have to be a valid input
* @param matrixC [IN|OUT] pointer to matrix C
* @param ldc [IN] leading dimension used to store the matrix C
* @param type [IN] computation type
* @param stream [IN] stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasHgemm(aclTransType transA, aclTransType transB, aclTransType transC, int m, int n,
int k, const aclFloat16 *alpha, const aclFloat16 *matrixA, int lda,
const aclFloat16 *matrixB, int ldb, const aclFloat16 *beta,
aclFloat16 *matrixC, int ldc, aclComputeType type, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create a handle for performing the matrix-matrix multiplication
*
* @param transA [IN] transpose type of matrix A
* @param transB [IN] transpose type of matrix B
* @param transC [IN] transpose type of matrix C
* @param m [IN] number of rows of matrix A and matrix C
* @param n [IN] number of columns of matrix B and matrix C
* @param k [IN] number of columns of matrix A and rows of matrix B
* @param type [IN] computation type
* @param handle [OUT] pointer to the pointer to the handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasCreateHandleForHgemm(aclTransType transA, aclTransType transB, aclTransType transC,
int m, int n, int k, aclComputeType type,
aclopHandle **handle);

/**
* @ingroup AscendCL
* @brief perform the matrix-matrix multiplication
*
* @param transA [IN] transpose type of matrix A
* @param transB [IN] transpose type of matrix B
* @param transC [IN] transpose type of matrix C
* @param m [IN] number of rows of matrix A and matrix C
* @param n [IN] number of columns of matrix B and matrix C
* @param k [IN] number of columns of matrix A and rows of matrix B
* @param alpha [IN] pointer to scalar used for multiplication
* @param matrixA [IN] pointer to matrix A
* @param lda [IN] leading dimension used to store the matrix A
* @param matrixB [IN] pointer to matrix B
* @param ldb [IN] leading dimension used to store the matrix B
* @param beta [IN] pointer to scalar used for multiplication.
* If beta value == 0,
* then matrixC does not have to be a valid input
* @param matrixC [IN|OUT] pointer to matrix C
* @param ldc [IN] leading dimension used to store the matrix C
* @param type [IN] computation type
* @param stream [IN] stream
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasS8gemm(aclTransType transA, aclTransType transB, aclTransType transC, int m, int n,
int k, const int32_t *alpha, const int8_t *matrixA, int lda,
const int8_t *matrixB, int ldb, const int32_t *beta, int32_t *matrixC,
int ldc, aclComputeType type, aclrtStream stream);

/**
* @ingroup AscendCL
* @brief create a handle for performing the matrix-matrix multiplication
*
* @param transA [IN] transpose type of matrix A
* @param transB [IN] transpose type of matrix B
* @param transC [IN] transpose type of matrix C
* @param m [IN] number of rows of matrix A and matrix C
* @param n [IN] number of columns of matrix B and matrix C
* @param k [IN] number of columns of matrix A and rows of matrix B
* @param type [IN] computation type
* @param handle [OUT] pointer to the pointer to the handle
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclblasCreateHandleForS8gemm(aclTransType transA, aclTransType transB, aclTransType transC,
int m, int n, int k, aclComputeType type,
aclopHandle **handle);

#ifdef __cplusplus
}
#endif

#endif // INC_EXTERNAL_ACL_OPS_ACL_CBLAS_H_
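
As with the handle-creation variants, the execution interfaces above launch asynchronously on the given stream. A hedged fp16 GEMM sketch, assuming devA/devB/devC were allocated with aclrtMalloc, the stream came from aclrtCreateStream, m/n/k are caller-supplied dimensions, and the matrices use plain row-major storage (ACL_TRANS_N, no NZ layout):

// Hedged sketch: C = alpha * A * B + beta * C in fp16 with high-precision accumulate.
aclFloat16 alpha = aclFloatToFloat16(1.0f);
aclFloat16 beta = aclFloatToFloat16(0.0f);   // beta == 0: C need not hold valid data
aclError ret = aclblasHgemm(ACL_TRANS_N, ACL_TRANS_N, ACL_TRANS_N, m, n, k,
                            &alpha, devA, k,          // lda = k for row-major A (m x k)
                            devB, n,                  // ldb = n for row-major B (k x n)
                            &beta, devC, n,           // ldc = n for row-major C (m x n)
                            ACL_COMPUTE_HIGH_PRECISION, stream);
if (ret == ACL_SUCCESS) {
    ret = aclrtSynchronizeStream(stream);  // the launch is asynchronous
}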

+ 2568
- 0
inc/external/acl/ops/acl_dvpp.h
File diff suppressed because it is too large
View File


+ 348
- 0
inc/external/acl/ops/acl_fv.h View File

@@ -0,0 +1,348 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_EXTERNAL_ACL_OPS_ACL_RETR_H_
#define INC_EXTERNAL_ACL_OPS_ACL_RETR_H_

#include "acl/acl.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct aclfvInitPara aclfvInitPara;
typedef struct aclfvFeatureInfo aclfvFeatureInfo;
typedef struct aclfvRepoRange aclfvRepoRange;
typedef struct aclfvQueryTable aclfvQueryTable;
typedef struct aclfvSearchInput aclfvSearchInput;
typedef struct aclfvSearchResult aclfvSearchResult;

// search operation type
enum aclfvSearchType {
SEARCH_1_N, // 1:N operation type
SEARCH_N_M // N:M operation type
};

/**
* @ingroup AscendCL
* @brief Create fv init param.
*
* @param fsNum [IN] The feature num
*
* @retval null for failed.
* @retval OtherValues success.
*/
ACL_FUNC_VISIBILITY aclfvInitPara *aclfvCreateInitPara(uint64_t fsNum);

/**
* @ingroup AscendCL
* @brief Destroy fv init param.
*
* @par Function
* Can only destroy fv init param information created
* through aclfvCreateInitPara interface.
*
* @param initPara [IN] fv init param.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclfvCreateInitPara
*/
ACL_FUNC_VISIBILITY aclError aclfvDestroyInitPara(aclfvInitPara *initPara);

/**
* @ingroup AscendCL
* @brief set value for maxTopNumFor1N which in fv init param.
*
* @param initPara [IN|OUT] fv init param.
* @param maxTopNumFor1N [IN] maxTopNumFor1N value for init param.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclfvSet1NTopNum(aclfvInitPara *initPara, uint32_t maxTopNumFor1N);

/**
* @ingroup AscendCL
* @brief set value for maxTopNumForNM which in fv init param.
*
* @param initPara [IN|OUT] fv init param.
* @param maxTopNumForNM [IN] maxTopNumForNM value for init param.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*/
ACL_FUNC_VISIBILITY aclError aclfvSetNMTopNum(aclfvInitPara *initPara, uint32_t maxTopNumForNM);

/**
* @ingroup AscendCL
* @brief Create fv feature info.
*
* @param id0 [IN] The first level library id0
* @param id1 [IN] Secondary library id1
* @param offset [IN] The offset of the first feature in the library
* @param featureLen [IN] Single feature length
* @param featureCount [IN] Single feature count
* @param featureData [IN] Feature value list
* @param featureDataLen [IN] Feature value list length
*
* @retval null for failed.
* @retval OtherValues success.
*/
ACL_FUNC_VISIBILITY aclfvFeatureInfo *aclfvCreateFeatureInfo(uint32_t id0, uint32_t id1, uint32_t offset,
uint32_t featureLen, uint32_t featureCount,
uint8_t *featureData, uint32_t featureDataLen);

/**
* @ingroup AscendCL
* @brief Destroy fv feature info.
*
* @par Function
* Can only destroy fv feature info information created
* through aclfvCreateFeatureInfo interface.
*
* @param featureInfo [IN] fv feature info.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclfvCreateFeatureInfo
*/
ACL_FUNC_VISIBILITY aclError aclfvDestroyFeatureInfo(aclfvFeatureInfo *featureInfo);

/**
* @ingroup AscendCL
* @brief Create fv repo range.
*
* @param id0Min [IN] id0 start value
* @param id0Max [IN] id0 max value
* @param id1Min [IN] id1 start value
* @param id1Max [IN] id1 max value
*
* @retval null for failed. OtherValues success
*/
ACL_FUNC_VISIBILITY aclfvRepoRange *aclfvCreateRepoRange(uint32_t id0Min, uint32_t id0Max, uint32_t id1Min,
uint32_t id1Max);

/**
* @ingroup AscendCL
* @brief Destroy fv repo range.
*
* @par Function
* Can only destroy fv repo range information created
* through aclfvCreateRepoRange interface.
*
* @param repoRange [IN] fv repo range.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclfvCreateRepoRange
*/
ACL_FUNC_VISIBILITY aclError aclfvDestroyRepoRange(aclfvRepoRange *repoRange);

/**
* @ingroup AscendCL
* @brief Create query table.
*
* @param queryCnt [IN] Number of query tables; the maximum is 6
* @param tableLen [IN] Length of a single table; each table is 32 KB
* @param tableData [IN] Feature value list
* @param tableDataLen [IN] The length of memory requested by the tableData pointer
*
* @retval null for failed. OtherValues success
*/
ACL_FUNC_VISIBILITY aclfvQueryTable *aclfvCreateQueryTable(uint32_t queryCnt, uint32_t tableLen, uint8_t *tableData,
uint32_t tableDataLen);

/**
* @ingroup AscendCL
* @brief Destroy query table.
*
* @par Function
* Can only destroy query table information created
* through aclfvCreateQueryTable interface.
*
* @param queryTable [IN] query table.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclfvCreateQueryTable
*/
ACL_FUNC_VISIBILITY aclError aclfvDestroyQueryTable(aclfvQueryTable *queryTable);

/**
* @ingroup AscendCL
* @brief Create search input.
*
* @param queryTable [IN] query table
* @param repoRange [IN] query repo range
* @param topk [IN] query topk
*
* @retval null for failed. OtherValues success
*/
ACL_FUNC_VISIBILITY aclfvSearchInput *aclfvCreateSearchInput(aclfvQueryTable *queryTable, aclfvRepoRange *repoRange,
uint32_t topk);

/**
* @ingroup AscendCL
* @brief Destroy search input.
*
* @par Function
* Can only destroy search input information created
* through aclfvCreateSearchInput interface.
*
* @param searchInput [IN] search input.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclfvCreateSearchInput
*/
ACL_FUNC_VISIBILITY aclError aclfvDestroySearchInput(aclfvSearchInput *searchInput);

/**
* @ingroup AscendCL
* @brief Create search result.
*
* @param queryCnt [IN] Number of query features
* @param resultNum [IN] Number of search results for each feature; the array length is queryCnt
* @param resultNumDataLen [IN] resultNum memory length
* @param id0 [IN] Level 1 library id0
* @param id1 [IN] Secondary library id1
* @param resultOffset [IN] The offset of the bottom library corresponding
* to each feature retrieval result, total length topK * queryCnt
* @param resultDistance [IN] Distance, total length topK * queryCnt
* @param dataLen [IN] The memory size requested by the
* id0/id1/resultOffset/resultDistance pointers
*
* @retval null for failed. OtherValues success
*/
ACL_FUNC_VISIBILITY aclfvSearchResult *aclfvCreateSearchResult(uint32_t queryCnt, uint32_t *resultNum,
uint32_t resultNumDataLen, uint32_t *id0, uint32_t *id1,
uint32_t *resultOffset, float *resultDistance,
uint32_t dataLen);

/**
* @ingroup AscendCL
* @brief Destroy search result.
*
* @par Function
* Can only destroy search result information created
* through aclfvCreateSearchResult interface.
*
* @param searchResult [IN] search result.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclfvCreateSearchResult
*/
ACL_FUNC_VISIBILITY aclError aclfvDestroySearchResult(aclfvSearchResult *searchResult);

/**
* @ingroup AscendCL
* @brief fv IP initialize.
*
* @param initPara [IN] fv init param.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure.
*/
ACL_FUNC_VISIBILITY aclError aclfvInit(aclfvInitPara *initPara);

/**
* @ingroup AscendCL
* @brief release fv resources.
*
* @par Function
* Can only release fv resources created
* through aclfvInit interface.
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure.
*
* @see aclfvInit
*/
ACL_FUNC_VISIBILITY aclError aclfvRelease();

/**
* @ingroup AscendCL
* @brief fv repo add.
*
* @param type [IN] repo add type
* @param featureInfo [IN] add feature information
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure.
*/
ACL_FUNC_VISIBILITY aclError aclfvRepoAdd(aclfvSearchType type, aclfvFeatureInfo *featureInfo);

/**
* @ingroup AscendCL
* @brief fv repo del.
*
* @param type [IN] repo delete type
* @param repoRange [IN] repo range information
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure.
*/
ACL_FUNC_VISIBILITY aclError aclfvRepoDel(aclfvSearchType type, aclfvRepoRange *repoRange);

/**
* @ingroup AscendCL
* @brief fv accurate del.
*
* @param featureInfo [IN] accurate delete feature information
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure.
*/
ACL_FUNC_VISIBILITY aclError aclfvDel(aclfvFeatureInfo *featureInfo);

/**
* @ingroup AscendCL
* @brief fv accurate modify.
*
* @param featureInfo [IN] accurate modify feature information
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure.
*/
ACL_FUNC_VISIBILITY aclError aclfvModify(aclfvFeatureInfo *featureInfo);

/**
* @ingroup AscendCL
* @brief fv search.
*
* @param type [IN] search type
* @param searchInput [IN] search input
* @param searchRst [OUT] search result
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure.
*/
ACL_FUNC_VISIBILITY aclError aclfvSearch(aclfvSearchType type, aclfvSearchInput *searchInput,
aclfvSearchResult *searchRst);

#ifdef __cplusplus
}
#endif

#endif // INC_EXTERNAL_ACL_OPS_ACL_RETR_H_
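
Taken together, the interfaces above imply a fixed life cycle: create and apply the init parameters, populate the repo, search, then release. A hedged sketch of the 1:N flow, where fsNum is caller-supplied and featureInfo, searchInput, and searchResult are assumed to come from the corresponding aclfvCreate* calls above:

// Hedged sketch of the 1:N retrieval flow.
aclfvInitPara *para = aclfvCreateInitPara(fsNum);
(void)aclfvSet1NTopNum(para, 5);                  // ask for top-5 results per query
if (aclfvInit(para) == ACL_SUCCESS) {
    (void)aclfvRepoAdd(SEARCH_1_N, featureInfo);  // populate the base library
    (void)aclfvSearch(SEARCH_1_N, searchInput, searchResult);
    (void)aclfvRelease();                         // pairs with aclfvInit
}
(void)aclfvDestroyInitPara(para);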

+ 159
- 0
inc/external/hccl/hccl.h View File

@@ -0,0 +1,159 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* @file hccl.h
* @brief HCCL API
*/

#ifndef HCCL_H_
#define HCCL_H_

#include <hccl/hccl_types.h>
#include <acl/acl.h>

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus

/**
* @brief Initialize HCCL.
*
* @param clusterInfo A string identifying the cluster info file path, including the file name.
* @param rank An integer identifying the rank id.
* @param comm A pointer identifying the initialized communication resource.
* @return HcclResult
* @see HcclCommDestroy()
*/
extern HcclResult HcclCommInitClusterInfo(const char *clusterInfo, uint32_t rank, HcclComm *comm);

/**
* @brief Get hccl root info.
*
* @param rootInfo A pointer identifying the hccl root info.
* @return HcclResult
*/
extern HcclResult HcclGetRootInfo(HcclRootInfo *rootInfo);

/**
* @brief Initialize HCCL with root info.
*
* @param nRanks An integer identifying the number of ranks in the cluster.
* @param rootInfo A struct identifying the hccl root info.
* @param rank An integer identifying the rank id.
* @param comm A pointer identifying the initialized communication resource.
* @return HcclResult
* @see HcclCommDestroy()
*/
extern HcclResult HcclCommInitRootInfo(uint32_t nRanks, const HcclRootInfo *rootInfo, uint32_t rank, HcclComm *comm);

/**
* @brief AllReduce operator.
*
* @param sendBuf A pointer identifying the input data address of the operator.
* @param recvBuf A pointer identifying the output data address of the operator.
* @param count An integer(u64) identifying the number of data elements.
* @param dataType The data type of the operator, must be one of the following types: int8, int16, int32, float16,
* float32.
* @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
* @param comm A pointer identifying the communication resource based on.
* @param stream A pointer identifying the stream information.
* @return HcclResult
*/
extern HcclResult HcclAllReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType dataType, HcclReduceOp op,
HcclComm comm, aclrtStream stream);

/**
* @brief Broadcast operator.
*
* @param buf A pointer identifying the data address of the operator.
* @param count An integer(u64) identifying the number of data elements.
* @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
* @param root An integer(u32) identifying the root rank in the operator.
* @param comm A pointer identifying the communication resource based on.
* @param stream A pointer identifying the stream information.
* @return HcclResult
*/
extern HcclResult HcclBroadcast(void *buf, uint64_t count, HcclDataType dataType, uint32_t root, HcclComm comm,
aclrtStream stream);

/**
* @brief ReduceScatter operator.
*
* @param sendBuf A pointer identifying the input data address of the operator.
* @param recvBuf A pointer identifying the output data address of the operator.
* @param recvCount An integer(u64) identifying the number of output data elements.
* @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
* @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
* @param comm A pointer identifying the communication resource based on.
* @param stream A pointer identifying the stream information.
* @return HcclResult
*/
extern HcclResult HcclReduceScatter(void *sendBuf, void *recvBuf, uint64_t recvCount, HcclDataType dataType,
HcclReduceOp op, HcclComm comm, aclrtStream stream);

/**
* @brief AllGather operator.
*
* @param sendBuf A pointer identifying the input data address of the operator.
* @param recvBuf A pointer identifying the output data address of the operator.
* @param sendCount An integer(u64) identifying the number of input data elements.
* @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
* @param comm A pointer identifying the communication resource based on.
* @param stream A pointer identifying the stream information.
* @return HcclResult
*/
extern HcclResult HcclAllGather(void *sendBuf, void *recvBuf, uint64_t sendCount, HcclDataType dataType, HcclComm comm,
aclrtStream stream);
/**
* @brief Get the rank size of this comm.
*
* @param comm A pointer identifying the communication resource based on.
* @param rankSize A pointer identifying the rank size.
* @return HcclResult
*/
extern HcclResult HcclGetRankSize(HcclComm comm, uint32_t *rankSize);

/**
* @brief Get the rank id of this comm.
*
* @param comm A pointer identifying the communication resource based on.
* @param rank A pointer identifying the rank id.
* @return HcclResult
*/
extern HcclResult HcclGetRankId(HcclComm comm, uint32_t *rank);
/**
* @brief Barrier operator.
*
* @param comm A pointer identifying the communication resource based on.
* @param stream A pointer identifying the stream information.
* @return HcclResult
*/
extern HcclResult HcclBarrier(HcclComm comm, aclrtStream stream);

/**
* @brief Destroy HCCL comm
*
* @param comm A pointer identifying the communication resource to be destroyed.
* @return HcclResult
* @see HcclCommInitClusterInfo()
*/
extern HcclResult HcclCommDestroy(HcclComm comm);

#ifdef __cplusplus
}
#endif // __cplusplus
#endif // HCCL_H_
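
A minimal end-to-end sketch under the cluster-info initialization path; rankTableFile, rankId, the device buffers, and the ACL stream are caller-side assumptions:

// Hedged sketch: one sum all-reduce over an existing ACL stream.
HcclComm comm = nullptr;
if (HcclCommInitClusterInfo(rankTableFile, rankId, &comm) == HCCL_SUCCESS) {
    // sum-reduce `count` fp32 elements from sendBuf into recvBuf on `stream`
    (void)HcclAllReduce(sendBuf, recvBuf, count, HCCL_DATA_TYPE_FP32,
                        HCCL_REDUCE_SUM, comm, stream);
    (void)aclrtSynchronizeStream(stream);  // collectives are stream-asynchronous
    (void)HcclCommDestroy(comm);
}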

+ 101
- 0
inc/external/hccl/hccl_types.h View File

@@ -0,0 +1,101 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* @file hccl_types.h
* @brief HCCL data type definition
*
*/

#ifndef HCCL_TYPES_H_
#define HCCL_TYPES_H_

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus

/**
* @brief HCCL functions return value definition
*/
typedef enum {
HCCL_SUCCESS = 0, /**< success */
HCCL_E_PARA = 1, /**< parameter error */
HCCL_E_PTR = 2, /**< empty pointer */
HCCL_E_MEMORY = 3, /**< memory error */
HCCL_E_INTERNAL = 4, /**< internal error */
HCCL_E_NOT_SUPPORT = 5, /**< not support feature */
HCCL_E_NOT_FOUND = 6, /**< not found specific resource */
HCCL_E_UNAVAIL = 7, /**< resource unavailable */
HCCL_E_SYSCALL = 8, /**< call system interface error */
HCCL_E_TIMEOUT = 9, /**< timeout */
HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */
HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */
HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */
HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */
HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */
HCCL_E_RUNTIME = 15, /**< call runtime api fail */
HCCL_E_DRV = 16, /**< call driver api fail */
HCCL_E_PROFILING = 17, /**< call profiling api fail */
HCCL_E_CCE = 18, /**< call cce api fail */
HCCL_E_NETWORK = 19, /**< call network api fail */
HCCL_E_RESERVED /**< reserved */
} HcclResult;

/**
* @brief handle to HCCL communicator
*/
typedef void *HcclComm;

/**
* @brief HCCL reduction operation
*/
typedef enum {
HCCL_REDUCE_SUM = 0, /**< sum */
HCCL_REDUCE_PROD = 1, /**< prod */
HCCL_REDUCE_MAX = 2, /**< max */
HCCL_REDUCE_MIN = 3, /**< min */
HCCL_REDUCE_RESERVED /**< reserved */
} HcclReduceOp;

/**
* @brief HCCL data type
*/
typedef enum {
HCCL_DATA_TYPE_INT8 = 0, /**< int8 */
HCCL_DATA_TYPE_INT16 = 1, /**< int16 */
HCCL_DATA_TYPE_INT32 = 2, /**< int32 */
HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */
HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */
HCCL_DATA_TYPE_INT64 = 5, /**< int64 */
HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */
HCCL_DATA_TYPE_RESERVED /**< reserved */
} HcclDataType;

const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length

/**
* @brief HCCL root info
*/
typedef struct HcclRootInfoDef {
char internal[HCCL_ROOT_INFO_BYTES];
} HcclRootInfo;

#ifdef __cplusplus
}
#endif // __cplusplus
#endif // HCCL_TYPES_H_
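
The enum names above encode their element widths, which callers need when converting an element count into a buffer size. A sketch of that mapping; the helper name is illustrative and not part of this header:

// Hedged sketch: byte width per HcclDataType, derived from the enum names above.
static inline size_t HcclTypeSize(HcclDataType t) {
  switch (t) {
    case HCCL_DATA_TYPE_INT8:   return 1;
    case HCCL_DATA_TYPE_INT16:  return 2;
    case HCCL_DATA_TYPE_FP16:   return 2;
    case HCCL_DATA_TYPE_INT32:  return 4;
    case HCCL_DATA_TYPE_FP32:   return 4;
    case HCCL_DATA_TYPE_INT64:  return 8;
    case HCCL_DATA_TYPE_UINT64: return 8;
    default:                    return 0;  // reserved/unknown
  }
}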

+ 109
- 0
inc/external/runtime/rt_error_codes.h View File

@@ -0,0 +1,109 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef __INC_EXTERNAL_RT_ERROR_CODES_H__
#define __INC_EXTERNAL_RT_ERROR_CODES_H__

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

static const int32_t ACL_RT_SUCCESS = 0; // success

static const int32_t ACL_ERROR_RT_PARAM_INVALID = 107000; // param invalid
static const int32_t ACL_ERROR_RT_INVALID_DEVICEID = 107001; // invalid device id
static const int32_t ACL_ERROR_RT_CONTEXT_NULL = 107002; // current context null
static const int32_t ACL_ERROR_RT_STREAM_CONTEXT = 107003; // stream not in current context
static const int32_t ACL_ERROR_RT_MODEL_CONTEXT = 107004; // model not in current context
static const int32_t ACL_ERROR_RT_STREAM_MODEL = 107005; // stream not in model
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID = 107006; // event timestamp invalid
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL = 107007; // event timestamp reversal
static const int32_t ACL_ERROR_RT_ADDR_UNALIGNED = 107008; // memory address unaligned
static const int32_t ACL_ERROR_RT_FILE_OPEN = 107009; // open file failed
static const int32_t ACL_ERROR_RT_FILE_WRITE = 107010; // write file failed
static const int32_t ACL_ERROR_RT_STREAM_SUBSCRIBE = 107011; // error subscribe stream
static const int32_t ACL_ERROR_RT_THREAD_SUBSCRIBE = 107012; // error subscribe thread
static const int32_t ACL_ERROR_RT_GROUP_NOT_SET = 107013; // group not set
static const int32_t ACL_ERROR_RT_GROUP_NOT_CREATE = 107014; // group not create
static const int32_t ACL_ERROR_RT_STREAM_NO_CB_REG = 107015; // callback not register to stream
static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid memory type
static const int32_t ACL_ERROR_RT_INVALID_HANDLE = 107017; // invalid handle
static const int32_t ACL_ERROR_RT_INVALID_MALLOC_TYPE = 107018; // invalid malloc type
static const int32_t ACL_ERROR_RT_WAIT_TIMEOUT = 107019; // wait timeout

static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPORT = 207000; // feature not support
static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error
static const int32_t ACL_ERROR_RT_MEMORY_FREE = 207002; // memory free error
static const int32_t ACL_ERROR_RT_AICORE_OVER_FLOW = 207003; // aicore over flow
static const int32_t ACL_ERROR_RT_NO_DEVICE = 207004; // no device
static const int32_t ACL_ERROR_RT_RESOURCE_ALLOC_FAIL = 207005; // resource alloc fail
static const int32_t ACL_ERROR_RT_NO_PERMISSION = 207006; // no permission
static const int32_t ACL_ERROR_RT_NO_EVENT_RESOURCE = 207007; // no event resource
static const int32_t ACL_ERROR_RT_NO_STREAM_RESOURCE = 207008; // no stream resource
static const int32_t ACL_ERROR_RT_NO_NOTIFY_RESOURCE = 207009; // no notify resource
static const int32_t ACL_ERROR_RT_NO_MODEL_RESOURCE = 207010; // no model resource
static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE = 207011; // no cdq resource

static const int32_t ACL_ERROR_RT_INTERNAL_ERROR = 507000; // runtime internal error
static const int32_t ACL_ERROR_RT_TS_ERROR = 507001; // ts internal error
static const int32_t ACL_ERROR_RT_STREAM_TASK_FULL = 507002; // task full in stream
static const int32_t ACL_ERROR_RT_STREAM_TASK_EMPTY = 507003; // task empty in stream
static const int32_t ACL_ERROR_RT_STREAM_NOT_COMPLETE = 507004; // stream not complete
static const int32_t ACL_ERROR_RT_END_OF_SEQUENCE = 507005; // end of sequence
static const int32_t ACL_ERROR_RT_EVENT_NOT_COMPLETE = 507006; // event not complete
static const int32_t ACL_ERROR_RT_CONTEXT_RELEASE_ERROR = 507007; // context release error
static const int32_t ACL_ERROR_RT_SOC_VERSION = 507008; // soc version error
static const int32_t ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT = 507009; // task type not support
static const int32_t ACL_ERROR_RT_LOST_HEARTBEAT = 507010; // ts lost heartbeat
static const int32_t ACL_ERROR_RT_MODEL_EXECUTE = 507011; // model execute failed
static const int32_t ACL_ERROR_RT_REPORT_TIMEOUT = 507012; // report timeout
static const int32_t ACL_ERROR_RT_SYS_DMA = 507013; // sys dma error
static const int32_t ACL_ERROR_RT_AICORE_TIMEOUT = 507014; // aicore timeout
static const int32_t ACL_ERROR_RT_AICORE_EXCEPTION = 507015; // aicore exception
static const int32_t ACL_ERROR_RT_AICORE_TRAP_EXCEPTION = 507016; // aicore trap exception
static const int32_t ACL_ERROR_RT_AICPU_TIMEOUT = 507017; // aicpu timeout
static const int32_t ACL_ERROR_RT_AICPU_EXCEPTION = 507018; // aicpu exception
static const int32_t ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR = 507019; // aicpu datadump response error
static const int32_t ACL_ERROR_RT_AICPU_MODEL_RSP_ERR = 507020; // aicpu model operate response error
static const int32_t ACL_ERROR_RT_PROFILING_ERROR = 507021; // profiling error
static const int32_t ACL_ERROR_RT_IPC_ERROR = 507022; // ipc error
static const int32_t ACL_ERROR_RT_MODEL_ABORT_NORMAL = 507023; // model abort normal
static const int32_t ACL_ERROR_RT_KERNEL_UNREGISTERING = 507024; // kernel unregistering
static const int32_t ACL_ERROR_RT_RINGBUFFER_NOT_INIT = 507025; // ringbuffer not init
static const int32_t ACL_ERROR_RT_RINGBUFFER_NO_DATA = 507026; // ringbuffer no data
static const int32_t ACL_ERROR_RT_KERNEL_LOOKUP = 507027; // kernel lookup error
static const int32_t ACL_ERROR_RT_KERNEL_DUPLICATE = 507028; // kernel register duplicate
static const int32_t ACL_ERROR_RT_DEBUG_REGISTER_FAIL = 507029; // debug register failed
static const int32_t ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL = 507030; // debug unregister failed
static const int32_t ACL_ERROR_RT_LABEL_CONTEXT = 507031; // label not in current context
static const int32_t ACL_ERROR_RT_PROGRAM_USE_OUT = 507032; // program register num use out
static const int32_t ACL_ERROR_RT_DEV_SETUP_ERROR = 507033; // device setup error
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TIMEOUT = 507034; // vector core timeout
static const int32_t ACL_ERROR_RT_VECTOR_CORE_EXCEPTION = 507035; // vector core exception
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_EXCEPTION = 507036; // vector core trap exception
static const int32_t ACL_ERROR_RT_CDQ_BATCH_ABNORMAL = 507037; // cdq alloc batch abnormal

static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error
static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error
static const int32_t ACL_ERROR_RT_SOCKET_CLOSE = 507901; // hdc disconnect

#ifdef __cplusplus
}
#endif

#endif // __INC_EXTERNAL_RT_ERROR_CODES_H__

+ 4
- 1
inc/framework/ge_runtime/task_info.h View File

@@ -271,13 +271,14 @@ class FusionEndTaskInfo : public TaskInfo {
class HcclTaskInfo : public TaskInfo {
public:
HcclTaskInfo(const std::string &op_name, uint32_t stream_id, const std::string hccl_type, void *input_data_addr,
void *output_data_addr, int64_t workspace_size, int64_t hccl_stream_num,
void *output_data_addr, void *workspace_addr, int64_t workspace_size, int64_t hccl_stream_num,
const std::vector<uint8_t> &private_def, void *ops_kernel_store, int32_t count, int64_t root_id,
int64_t op_type, int64_t data_type, const std::string &group, bool dump_flag)
: TaskInfo(op_name, stream_id, TaskInfoType::HCCL, dump_flag),
hccl_type_(hccl_type),
input_data_addr_(input_data_addr),
output_data_addr_(output_data_addr),
workspace_addr_(workspace_addr),
workspace_size_(workspace_size),
hccl_stream_num_(hccl_stream_num),
private_def_(private_def),
@@ -292,6 +293,7 @@ class HcclTaskInfo : public TaskInfo {
const std::string &hccl_type() const { return hccl_type_; }
void *input_data_addr() const { return input_data_addr_; }
void *output_data_addr() const { return output_data_addr_; }
void *workspace_addr() const { return workspace_addr_; }
int64_t workspace_size() const { return workspace_size_; }
int64_t hccl_stream_num() const { return hccl_stream_num_; }
const std::vector<uint8_t> &private_def() const { return private_def_; }
@@ -306,6 +308,7 @@ class HcclTaskInfo : public TaskInfo {
std::string hccl_type_;
void *input_data_addr_;
void *output_data_addr_;
void *workspace_addr_;
int64_t workspace_size_;
int64_t hccl_stream_num_;
std::vector<uint8_t> private_def_;
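
The change threads a workspace address through HcclTaskInfo alongside the existing workspace size, so the caller supplies the workspace buffer rather than leaving it to the runtime. A hedged construction sketch in which every pointer, size, and id is a caller-supplied placeholder; only the parameter order comes from the header above:

// Hedged sketch: constructing the updated HcclTaskInfo.
HcclTaskInfo task("allreduce_op", /*stream_id=*/0, "HcclAllReduce",
                  input_addr, output_addr,
                  workspace_addr,                      // newly threaded through
                  workspace_size, /*hccl_stream_num=*/1,
                  private_def, ops_kernel_store, /*count=*/1,
                  /*root_id=*/0, /*op_type=*/0, /*data_type=*/0,
                  "hccl_world_group", /*dump_flag=*/false);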


+ 1
- 1
metadef

@@ -1 +1 @@
Subproject commit a725349b65aef2940555af2ddb7b9461fbe0d5fd
Subproject commit 211788997dcc9aa63527541a44d511388c06bce5

+ 107
- 0
scripts/format_source_code.sh View File

@@ -0,0 +1,107 @@
#!/bin/bash
# Copyright 2019-2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

set -e

CLANG_FORMAT=$(which clang-format) || (echo "Please install 'clang-format' tool first"; exit 1)

version=$("${CLANG_FORMAT}" --version | sed -n "s/.*\ \([0-9]*\)\.[0-9]*\.[0-9]*.*/\1/p")
if [[ "${version}" -lt "8" ]]; then
echo "clang-format's version must be at least 8.0.0"
exit 1
fi

CURRENT_PATH=$(pwd)
SCRIPTS_PATH=$(dirname "$0")

echo "CURRENT_PATH=${CURRENT_PATH}"
echo "SCRIPTS_PATH=${SCRIPTS_PATH}"

# print usage message
function usage()
{
echo "Format the specified source files to conform the code style."
echo "Usage:"
echo "bash $0 [-a] [-c] [-l] [-h]"
echo "e.g. $0 -c"
echo ""
echo "Options:"
echo " -a format of all files"
echo " -c format of the files changed compared to last commit, default case"
echo " -l format of the files changed in last commit"
echo " -h Print usage"
}

# check and set options
function checkopts()
{
# init variable
mode="changed" # default format changed files

# Process the options
while getopts 'aclh' opt
do
case "${opt}" in
a)
mode="all"
;;
c)
mode="changed"
;;
l)
mode="lastcommit"
;;
h)
usage
exit 0
;;
*)
echo "Unknown option ${opt}!"
usage
exit 1
esac
done
}

# init variable
# check options
checkopts "$@"

# switch to project root path, which contains clang-format config file '.clang-format'
cd "${SCRIPTS_PATH}/.." || exit 1

FMT_FILE_LIST='__format_files_list__'

if [[ "X${mode}" == "Xall" ]]; then
find src -type f -name "*" | grep "\.h$\|\.cc$" > "${FMT_FILE_LIST}" || true
find inc -type f -name "*" | grep "\.h$\|\.cc$" >> "${FMT_FILE_LIST}" || true
elif [[ "X${mode}" == "Xchanged" ]]; then
# --diff-filter=ACMRTUXB will ignore deleted files in commit
git diff --diff-filter=ACMRTUXB --name-only | grep "^inc\|^src" | grep "\.h$\|\.cc$" >> "${FMT_FILE_LIST}" || true
else # "X${mode}" == "Xlastcommit"
git diff --diff-filter=ACMRTUXB --name-only HEAD~ HEAD | grep "^inc\|^src" | grep "\.h$\|\.cc$" > "${FMT_FILE_LIST}" || true
fi

while read line; do
if [ -f "${line}" ]; then
${CLANG_FORMAT} -i "${line}"
fi
done < "${FMT_FILE_LIST}"

rm "${FMT_FILE_LIST}"
cd "${CURRENT_PATH}" || exit 1

echo "Specified cpp source files have been format successfully."

+ 10
- 9
third_party/fwkacllib/inc/cce/taskdown_common.hpp View File

@@ -27,15 +27,16 @@ namespace cce {
#define CC_FUSION_OP_MAX 32

typedef enum tagccKernelType {
CCE_AI_CORE = 0, /* cce aicore */
CCE_AI_CPU = 1, /* cce aicpu */
TE = 2, /* te operator*/
CUSTOMIZED = 3, /* customized operator */
TE_AI_CORE = 4, /* te aicore operator*/
TE_AI_CPU = 5, /* te aicpu operator */
AI_CPU = 6, /* aicpu */
CUST_AI_CPU = 7, /* custom aicpu*/
INVALID = 8, /* unknown kernel type */
CCE_AI_CORE = 0, /* cce aicore */
CCE_AI_CPU = 1, /* cce aicpu */
TE = 2, /* te operator*/
CUSTOMIZED = 3, /* customized operator */
TE_AI_CORE = 4, /* te aicore operator*/
TE_AI_CPU = 5, /* te aicpu operator */
AI_CPU = 6, /* aicpu */
CUST_AI_CPU = 7, /* custom aicpu*/
HOST_CPU = 8, /* host cpu */
INVALID = 10000 /* unknown kernel type */
} ccKernelType;

typedef struct tagOpContext {


+ 0
- 0
third_party/fwkacllib/inc/external/runtime/rt_error_codes.h View File


+ 18
- 18
third_party/fwkacllib/inc/hccl/base.h View File

@@ -124,27 +124,27 @@ struct HcomRemoteAccessAddrInfo {
};

struct HcomAllToAllVParams {
void *sendbuf;
void *sendcounts;
void *sdispls;
HcclDataType sendtype;
void *recvbuf;
void *recvcounts;
void *rdispls;
HcclDataType recvtype;
const char *group;
void *sendbuf; // device mem
void *sendcounts; // device mem; Type: uint64
void *sdispls; // device mem; Type: uint64
HcclDataType sendtype;
void *recvbuf; // device mem
void *recvcounts; // device mem; Type: uint64
void *rdispls; // device mem; Type: uint64
HcclDataType recvtype;
const char *group; // not used now
};

struct HcomGatherAllToAllVParams {
void *addrInfo;
void *addrInfoCountPerRank;
void *recvbuf;
void *recvcounts;
void *rdispls;
void *gatheredbuf;
s32 addrLength;
HcclDataType recvtype;
const char *group;
void *addrInfo; // device mem; contains host VA[uint64]: [addr, length, addr, length, addr, length, ...]
void *addrInfoCountPerRank; // device mem; length: ranksize; contains addrInfoCounts for every rank
void *recvbuf; // device mem
void *recvcounts; // device mem; Type: uint64
void *rdispls; // device mem; Type: uint64
void *gatheredbuf; // device mem
s32 addrLength;
HcclDataType recvtype;
const char *group; // not used now
};

#ifdef __cplusplus


+ 0
- 101
third_party/fwkacllib/inc/hccl/hccl_types.h View File

@@ -1,101 +0,0 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* @file hccl_types.h
* @brief HCCL data type definition
*
*/
#ifndef HCCL_TYPES_H_
#define HCCL_TYPES_H_

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus

/**
* @brief HCCL functions return value definition
*/
typedef enum {
HCCL_SUCCESS = 0, /**< success */
HCCL_E_PARA = 1, /**< parameter error */
HCCL_E_PTR = 2, /**< empty pointer */
HCCL_E_MEMORY = 3, /**< memory error */
HCCL_E_INTERNAL = 4, /**< internal error */
HCCL_E_NOT_SUPPORT = 5, /**< not support feature */
HCCL_E_NOT_FOUND = 6, /**< not found specific resource */
HCCL_E_UNAVAIL = 7, /**< resource unavailable */
HCCL_E_SYSCALL = 8, /**< call system interface error */
HCCL_E_TIMEOUT = 9, /**< timeout */
HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */
HCCL_E_TCP_CONNECT = 11, /**< tcp connect fail */
HCCL_E_ROCE_CONNECT = 12, /**< roce connect fail */
HCCL_E_TCP_TRANSFER = 13, /**< tcp transfer fail */
HCCL_E_ROCE_TRANSFER = 14, /**< roce transfer fail */
HCCL_E_RUNTIME = 15, /**< call runtime api fail */
HCCL_E_DRV = 16, /**< call driver api fail */
HCCL_E_PROFILING = 17, /**< call profiling api fail */
HCCL_E_CCE = 18, /**< call cce api fail */
HCCL_E_NETWORK = 19, /**< call network api fail */
HCCL_E_RESERVED /**< reserved */
} HcclResult;

/**
* @brief handle to HCCL communicator
*/
typedef void *HcclComm;

/**
* @brief HCCL Reduction opperation
*/
typedef enum {
HCCL_REDUCE_SUM = 0, /**< sum */
HCCL_REDUCE_PROD = 1, /**< prod */
HCCL_REDUCE_MAX = 2, /**< max */
HCCL_REDUCE_MIN = 3, /**< min */
HCCL_REDUCE_RESERVED /**< reserved */
} HcclReduceOp;

/**
* @brief HCCL data type
*/
typedef enum {
HCCL_DATA_TYPE_INT8 = 0, /**< int8 */
HCCL_DATA_TYPE_INT16 = 1, /**< int16 */
HCCL_DATA_TYPE_INT32 = 2, /**< int32 */
HCCL_DATA_TYPE_FP16 = 3, /**< fp16 */
HCCL_DATA_TYPE_FP32 = 4, /**< fp32 */
HCCL_DATA_TYPE_INT64 = 5, /**< int64 */
HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */
HCCL_DATA_TYPE_RESERVED /**< reserved */
} HcclDataType;

const uint32_t HCCL_ROOT_INFO_BYTES = 4108; // 4108: root info length

/**
* @brief HCCL root info
*/
typedef struct HcclRootInfoDef {
char internal[HCCL_ROOT_INFO_BYTES];
} HcclRootInfo;

#ifdef __cplusplus
}
#endif // __cplusplus
#endif // HCCL_TYPES_H_

+ 14
- 0
third_party/fwkacllib/inc/hccl/hcom.h View File

@@ -164,8 +164,22 @@ HcclResult HcomExecEnqueueRemoteAccess(const std::string& remoteAccessType,
const std::vector<HcomRemoteAccessAddrInfo>& addrInfos,
std::function<void(HcclResult status)> callback);

/**
* @brief Puts an alltoallv communication operation into the hcom executor.
*
* @param params information about the alltoallv communication operation.
* @param callback callback invoked after the collective communication operation completes.
* @return HcclResult
*/
HcclResult HcomExecEnqueueAllToAllV(HcomAllToAllVParams params, std::function<void(HcclResult status)> callback);

/**
* @brief Puts a gather-alltoallv communication operation into the hcom executor.
*
* @param params information about the gather-alltoallv communication operation.
* @param callback callback invoked after the collective communication operation completes.
* @return HcclResult
*/
HcclResult HcomExecEnqueueGatherAllToAllV(HcomGatherAllToAllVParams params,
std::function<void(HcclResult status)> callback);
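A hedged usage sketch of the new executor entry point (not from the source; the dev* pointers stand for device buffers prepared elsewhere, and error handling is elided):

// Assumed calling pattern for HcomExecEnqueueGatherAllToAllV.
void EnqueueGatherAllToAllVSketch(void *devAddrInfo, void *devCounts, void *devRecvBuf,
                                  void *devRecvCounts, void *devRdispls, void *devGatheredBuf) {
  HcomGatherAllToAllVParams params{};
  params.addrInfo = devAddrInfo;            // packed [addr, length, ...] pairs
  params.addrInfoCountPerRank = devCounts;  // one addrInfo count per rank
  params.recvbuf = devRecvBuf;
  params.recvcounts = devRecvCounts;        // uint64 counts
  params.rdispls = devRdispls;              // uint64 displacements
  params.gatheredbuf = devGatheredBuf;
  params.addrLength = -2;                   // all lengths equal but unknown
  params.recvtype = HCCL_DATA_TYPE_INT64;
  params.group = nullptr;                   // "not used now" per the struct comment
  HcomExecEnqueueGatherAllToAllV(params, [](HcclResult status) {
    // completion callback; status reports the collective's result
  });
}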



+ 1
- 0
third_party/fwkacllib/inc/mmpa/mmpa_api.h View File

@@ -56,6 +56,7 @@
#include <dirent.h>
#include <getopt.h>
#include <libgen.h>
#include <malloc.h>

#include <linux/types.h>
#include <linux/hdreg.h>


+ 4
- 0
third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h View File

@@ -550,6 +550,10 @@ MMPA_FUNC_VISIBILITY mmFileHandle mmShmOpen(const CHAR *name, INT32 oflag, mmMod
MMPA_FUNC_VISIBILITY INT32 mmShmUnlink(const CHAR *name);
MMPA_FUNC_VISIBILITY VOID *mmMmap(mmFd_t fd, mmSize_t size, mmOfft_t offset, mmFd_t *extra, INT32 prot, INT32 flags);
MMPA_FUNC_VISIBILITY INT32 mmMunMap(VOID *data, mmSize_t size, mmFd_t *extra);

MMPA_FUNC_VISIBILITY mmSize mmGetPageSize();
MMPA_FUNC_VISIBILITY VOID *mmAlignMalloc(mmSize mallocSize, mmSize alignSize);
MMPA_FUNC_VISIBILITY VOID mmAlignFree(VOID *addr);
#define MMPA_DLL_API

#ifdef __cplusplus


+ 4
- 0
third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h View File

@@ -557,6 +557,10 @@ MMPA_FUNC_VISIBILITY mmFileHandle mmShmOpen(const CHAR *name, INT32 oflag, mmMod
MMPA_FUNC_VISIBILITY INT32 mmShmUnlink(const CHAR *name);
MMPA_FUNC_VISIBILITY VOID *mmMmap(mmFd_t fd, mmSize_t size, mmOfft_t offset, mmFd_t *extra, INT32 prot, INT32 flags);
MMPA_FUNC_VISIBILITY INT32 mmMunMap(VOID *data, mmSize_t size, mmFd_t *extra);

MMPA_FUNC_VISIBILITY mmSize mmGetPageSize();
MMPA_FUNC_VISIBILITY VOID *mmAlignMalloc(mmSize mallocSize, mmSize alignSize);
MMPA_FUNC_VISIBILITY VOID mmAlignFree(VOID *addr);
#ifdef __cplusplus
#if __cplusplus
}
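Both platform headers now expose the same aligned-allocation trio; a minimal usage sketch based on the declarations above (the include path is an assumption):

#include "mmpa/mmpa_api.h"  // assumed include path

void AlignedBufferDemo() {
  mmSize page = mmGetPageSize();          // platform page size
  VOID *buf = mmAlignMalloc(4096, page);  // 4 KiB buffer, page-aligned
  if (buf != nullptr) {
    // ... use buf ...
    mmAlignFree(buf);                     // must pair with mmAlignMalloc
  }
}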


+ 3
- 1
third_party/fwkacllib/inc/ops/aipp.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -65,6 +65,8 @@ in aipp config file, framework will auto add one input node to graph at last. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator AippData.
*@par Restrictions:
*Warning: This operator can be integrated only by configuring INSERT_OP_FILE of aclgrphBuildModel. Please do not use it directly.
*/
REG_OP(AippData)
.INPUT(data, TensorType::ALL())


+ 2
- 1
third_party/fwkacllib/inc/ops/all_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -39,6 +39,7 @@
#include "image_ops.h"
#include "internal_ops.h"
#include "linalg_ops.h"
#include "list_ops.h"
#include "logging_ops.h"
#include "lookup_ops.h"
#include "math_ops.h"


+ 100
- 4
third_party/fwkacllib/inc/ops/array_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -626,7 +626,7 @@ REG_OP(StopGradient)
*x: A tensor. \n

*@par Outputs:
*y: A tensor. \n
*y: A tensor with the same shape and contents as input. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator Identity.
@@ -666,7 +666,7 @@ REG_OP(IdentityN)
*@li axis: The dimension index at which to expand. \n

*@par Outputs:
*y: A tensor. \n
*y: A tensor with the same data as input, with an additional dimension inserted at the index specified by axis. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator ExpandDims.
@@ -713,7 +713,7 @@ REG_OP(Unsqueeze)
*@par Outputs:
*y: A tensor. \n

*@par Attention:
*@attention Constraints:
*This operator cannot be directly called by the aclopExecute API. \n

*@par Third-party framework compatibility
@@ -1153,6 +1153,102 @@ REG_OP(EditDistance)
.OUTPUT(output, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(EditDistance)

/**
* @brief Sorts the elements of the input tensor along the given axis (sort_v2).

* @par Inputs:
* @li x: An ND tensor of type float16.

* @par Attributes:

* @li axis: An optional int. The dimension to sort along. Defaults to -1.
* @li descending: An optional bool that controls the sorting order. Defaults to False (ascending).

* @par Outputs:
* @li y: An ND tensor of type float16.

* @attention Constraints:
* @li "axis" must select the last dimension.
* @li This TBE operator is recommended when the data to be sorted contains fewer than 150K elements;
descending order performs better than ascending order.
* @li The upper limit of data on Ascend910 is 2000K elements.
*/
REG_OP(SortV2)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.ATTR(axis, Int, -1)
.ATTR(descending, Bool, false)
.OP_END_FACTORY_REG(SortV2)

/**
* @brief Expands the input tensor to a compatible shape. \n

* @par Inputs:
* Two inputs, including:
* @li x: A Tensor. Must be one of the following types:
* float16, float32, int32, int8, uint8. \n
* @li shape: A Tensor specifying the shape that the input tensor is expanded to. \n

* @par Outputs:
* @li y: A Tensor. Has the same type as "x", and the shape specified by the "shape" input. \n

* @par Third-party framework compatibility
* Compatible with the ONNX operator Expand.
*/

REG_OP(Expand)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8}))
.INPUT(shape, TensorType({DT_INT16, DT_INT32, DT_INT64}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8}))
.OP_END_FACTORY_REG(Expand)

/**
*@brief Returns a tensor containing the indices of all non-zero elements of the input. \n

*@par Inputs:
*@li x: A Tensor. Must be one of the following types: double, float32, float16, int8, uint8,
*int16, uint16, int32, uint32, int64, uint64, bool.

*@par Attributes:
* transpose: An optional bool. The output tensor will be transposed if true. \n

*@par Outputs:
* y: A Tensor of type int64, containing the indices of the non-zero elements. \n

*@par Third-party framework compatibility
*Compatible with the PyTorch operator NonZero.
*/

REG_OP(NonZero)
.INPUT(x, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \
DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL}))
.OUTPUT(y, TensorType({DT_INT64}))
.ATTR(transpose, Bool, false)
.OP_END_FACTORY_REG(NonZero)

/**
* @brief Expands the input tensor to a compatible shape. \n

* @par Inputs:
* One input, including:
* @li x: A Tensor. Must be one of the following types:
* float16, float32, int32, int8, uint8. \n

* @par Attributes:
* @li shape: A required listInt specifying the shape that the input tensor is expanded to. \n

* @par Outputs:
* @li y: A Tensor. Has the same type as "x", and the shape specified by the "shape" attribute. \n

* @par Third-party framework compatibility
* Compatible with the ONNX operator Expand.
*/

REG_OP(ExpandD)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8}))
.REQUIRED_ATTR(shape, ListInt)
.OP_END_FACTORY_REG(ExpandD)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_ARRAY_OPS_H_
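REG_OP registrations like the ones above typically generate ge::op wrapper classes with set_input_/set_attr_ helpers; a hedged construction sketch for the new SortV2 and ExpandD ops (the wrapper API shape is assumed, not defined in this header):

#include "array_ops.h"  // assumed include path for the registrations above

void BuildArrayOpsSketch() {
  ge::op::SortV2 sort("sort_v2");
  sort.set_attr_axis(-1);           // sort along the last dimension
  sort.set_attr_descending(true);   // descending performs better per the note above

  ge::op::ExpandD expand_d("expand_d");
  expand_d.set_attr_shape({2, 3, 4});  // static target shape (ExpandD variant)
}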

+ 1
- 1
third_party/fwkacllib/inc/ops/audio_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 58
- 0
third_party/fwkacllib/inc/ops/avg_pool_1d_ops.h View File

@@ -0,0 +1,58 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*!
* \file avg_pool_1d_ops.h
* \brief
*/
#ifndef OPS_BUILT_IN_OP_PROTO_INC_AVGPOOL1DOPS_H_
#define OPS_BUILT_IN_OP_PROTO_INC_AVGPOOL1DOPS_H_
#include "graph/operator_reg.h"

namespace ge {
/**
*@brief Generates an auxiliary matrix for average pooling. \n

*@par Inputs:
* @li x: A tensor. Must be one of the following types: uint8, int8, int16, int32,
int64, float16, float, double. The format must be NHWC, NCHW or NC1HWC0.

*@par Attributes:
*@li ksize: Kernel size. Input type is int.
*@li strides: Stride size. Input type is int.
*@li pads: Input type is listInt.
*@li ceil_mode: Bool, default value is false.
*@li count_include_pad: Bool, default value is false. \n

*@par Outputs:
*y: A tensor with the same type as "x". \n
*/
REG_OP(AvgPool1DAvgMatrix)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT8,
DT_INT32, DT_INT64, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT8,
DT_INT32, DT_INT64, DT_DOUBLE}))
.REQUIRED_ATTR(ksize, Int)
.REQUIRED_ATTR(strides, Int)
.REQUIRED_ATTR(pads, ListInt)
.ATTR(ceil_mode, Bool, false)
.ATTR(count_include_pad, Bool, false)
.OP_END_FACTORY_REG(AvgPool1DAvgMatrix)
}  // namespace ge
#endif  // OPS_BUILT_IN_OP_PROTO_INC_AVGPOOL1DOPS_H_

+ 13
- 8
third_party/fwkacllib/inc/ops/batch_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -64,10 +64,10 @@ the same types as "x_tensors" . It's a dynamic output. \n
REG_OP(Batch)
.DYNAMIC_INPUT(x_tensors, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, \
DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE}))
.OUTPUT(y_index, TensorType({ DT_INT64 }))
.OUTPUT(y_id, TensorType({ DT_INT64 }))
.DYNAMIC_OUTPUT(y_tensors, TensorType({DT_INT8, DT_UINT8, DT_INT16, \
DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT, DT_FLOAT16, DT_DOUBLE, DT_BOOL}))
.OUTPUT(y_index, TensorType({ DT_INT64 }))
.OUTPUT(y_id, TensorType({ DT_INT64 }))
.REQUIRED_ATTR(num_batch_threads, Int)
.REQUIRED_ATTR(max_batch_size, Int)
.ATTR(max_enqueued_batches, Int, 10)
@@ -107,11 +107,13 @@ across multiple sessions . \n

REG_OP(Unbatch)
.INPUT(x_tensor, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE}))
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE, DT_FLOAT16, \
DT_COMPLEX64, DT_COMPLEX128}))
.INPUT(index, TensorType({DT_INT64}))
.INPUT(id, TensorType({DT_INT64}))
.OUTPUT(y_tensor, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE}))
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE, DT_FLOAT16, \
DT_COMPLEX64, DT_COMPLEX128}))
.REQUIRED_ATTR(timeout_micros, Int)
.ATTR(container, String, "")
.ATTR(shared_name, String, "")
@@ -146,13 +148,16 @@ across multiple sessions . \n

REG_OP(UnbatchGrad)
.INPUT(x_input, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE}))
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE, DT_FLOAT16, \
DT_COMPLEX64, DT_COMPLEX128}))
.INPUT(index, TensorType({DT_INT64}))
.INPUT(grad, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE}))
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE, DT_FLOAT16, \
DT_COMPLEX64, DT_COMPLEX128}))
.INPUT(id, TensorType({DT_INT64}))
.OUTPUT(y_grad, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE}))
DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE, DT_FLOAT16, \
DT_COMPLEX64, DT_COMPLEX128}))
.ATTR(container, String, "")
.ATTR(shared_name, String, "")
.OP_END_FACTORY_REG(UnbatchGrad)


+ 30
- 1
third_party/fwkacllib/inc/ops/bitwise_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,6 +25,35 @@

namespace ge {

/**
*@brief Computes the element-wise bitwise left-shift of x by y. \n

*@par Inputs:
*Two inputs, including:
* @li x: A Tensor. Must be one of the following types: int8, int16, int32,
int64, uint8, uint16, uint32, uint64.
* @li y: A Tensor. Has the same type as "x". \n

*@par Outputs:
* z: A Tensor. Has the same type as "x". \n

*@attention Constraints:
*LeftShift runs on the Ascend AI CPU, which delivers poor performance. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator LeftShift.
*/

REG_OP(LeftShift)
.INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, \
DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64}))
.INPUT(y, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, \
DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64}))
.OUTPUT(z, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, \
DT_UINT8, DT_UINT16, DT_UINT32, DT_UINT64}))
.OP_END_FACTORY_REG(LeftShift)
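Worked element-wise semantics for LeftShift (illustrative values, not from this header):

#include <cstdint>

void LeftShiftExample() {
  int8_t x[] = {1, 2, 3};
  int8_t y[] = {1, 2, 3};
  int8_t z[3];
  for (int i = 0; i < 3; ++i) {
    z[i] = static_cast<int8_t>(x[i] << y[i]);  // z = {2, 8, 24}
  }
}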

/**
*@brief Element-wise computes the bitwise right-shift of x and y . \n



+ 1
- 1
third_party/fwkacllib/inc/ops/boosted_trees_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/candidate_sampling_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/condtake_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 6
- 6
third_party/fwkacllib/inc/ops/control_flow_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -96,7 +96,7 @@ REG_OP(RefMerge)
* Otherwise, the data is forwarded to "output_false" . \n

*@par Inputs:
*@li data: The tensor to be forwarded. \ n
*@li data: The tensor to be forwarded. \n
* Must be one of the following types: float16, float32, float64,
* int8, int16, int32, int64, uint8, uint16, uint32, uint64, bool.
*@li pred: A boolean scalar. The output port that will receive data . \n
@@ -387,12 +387,12 @@ REG_OP(ControlTrigger)

*@par Inputs:
* Three inputs, including:
*@li x: One dimensional tensore of type int32, specifying queried shape, max size is 8.
*@li data_seq: One dimensional tensore of type int32, specifying the mapped table is queried.
*@li level_index: One dimensional tensore of type int32, specifying secondary index. \n
*@li x: One dimensional tensor of type int32, specifying queried shape, max size is 128.
*@li data_seq: One dimensional tensor of type int32, specifying the mapped table is queried.
*@li level_index: One dimensional tensor of type int32, specifying secondary index. \n

*@par Outputs:
*@li y: A Tensor with shape [batch, 8], of type int32, specifying index of shape in the map.
*@li y: A Tensor with shape [8], of type int32, specifying index of shape in the map.
*@par Third-party framework compatibility
* It is a custom operator. It has no corresponding operator in Caffe.
*/


+ 52
- 0
third_party/fwkacllib/inc/ops/correlation.h View File

@@ -0,0 +1,52 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*!
* \file correlation.h
* \brief
*/
#ifndef GE_OP_CORRELATION_OPS_H
#define GE_OP_CORRELATION_OPS_H

#include "graph/operator_reg.h"

namespace ge {
/**
*@brief Computes a 2D Correlation given 4D "x" and "filter" tensors.
*
*@par Inputs:
* @li filter: A 4D tensor of filters.
* @li x: A 4D tensor of input images; its batch number must equal that of
* "filter", and its channel count must equal that of "filter".
*
*@par Attributes:
* @li groups: Sets the correlation mode; must be 1 or equal to the channel count.
*
*@par Outputs:
*y: A Tensor. Has the same type as "x".

*@par Third-party framework compatibility
* Compatible with caffe correlation custom operator.
*/
REG_OP(Correlation)
.INPUT(filter, TensorType({DT_FLOAT16, DT_INT8}))
.INPUT(x, TensorType({DT_FLOAT16, DT_INT8}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32}))
.ATTR(groups, Int, 1)
.OP_END_FACTORY_REG(Correlation)
} // namespace ge

#endif  // GE_OP_CORRELATION_OPS_H

+ 82
- 1
third_party/fwkacllib/inc/ops/ctc_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -137,6 +137,87 @@ REG_OP(CTCBeamSearchDecoder)
.OUTPUT(log_probability, TensorType({DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(CTCBeamSearchDecoder)

/**
*@brief The Connectionist Temporal Classification loss.

*@par Inputs:
*@li log_probs: Tensor of size (T, N, C), where T = input length, N = batch size,
and C = number of classes (including blank).
It represents the logarithmized probabilities of the outputs.
*@li targets: Tensor of size (N, S), where S = max target length.
It represents the target sequences.
*@li input_lengths: Tuple or tensor of size (N). It represents the lengths of the inputs.
*@li target_lengths: Tuple or tensor of size (N). It represents the lengths of the targets.

*@par Outputs:
*@li neg_log_likelihood: A loss value which is differentiable with respect to each input node.
*@li log_alpha: The probability of each possible trace from input to target.

*@par Attributes:
*@li blank: Blank label. Defaults to 0.
*@li reduction: Specifies the reduction to apply to the output. Defaults to 'mean'.
*@li zero_infinity: Whether to zero infinite losses and the associated gradients.

*@par Third-party framework compatibility
* Compatible with the PyTorch CTCLoss operator.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(CTCLossV2)
.INPUT(log_probs, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(targets, TensorType({DT_INT32, DT_INT64}))
.INPUT(input_lengths, TensorType({DT_INT32, DT_INT64}))
.INPUT(target_lengths, TensorType({DT_INT32, DT_INT64}))
.OUTPUT(neg_log_likelihood, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(log_alpha, TensorType({DT_FLOAT, DT_DOUBLE}))
.ATTR(blank, Int, 0)
.ATTR(reduction, String, "mean")
.ATTR(zero_infinity, Bool, false)
.OP_END_FACTORY_REG(CTCLossV2)

/**
*@brief The Connectionist Temporal Classification loss grad.

*@par Inputs:
*@li grad_out: Gradient renewal coefficient. Tensor of size (N), where N = batch size.
*@li log_probs: Tensor of size (T, N, C), where T = input length, N = batch size,
and C = number of classes (including blank).
It represents the logarithmized probabilities of the outputs.
*@li targets: Tensor of size (N, S), where S = max target length.
It represents the target sequences.
*@li input_lengths: Tuple or tensor of size (N). It represents the lengths of the inputs.
*@li target_lengths: Tuple or tensor of size (N). It represents the lengths of the targets.
*@li neg_log_likelihood: A loss value which is differentiable with respect to each input node.
*@li log_alpha: The probability of each possible trace from input to target.

*@par Outputs:
*@li grad: Tensor of size (T, N, C), the gradient of the Connectionist Temporal Classification loss.

*@par Attributes:
*@li blank: Blank label. Defaults to 0.
*@li reduction: Specifies the reduction to apply to the output. Defaults to 'mean'.
*@li zero_infinity: Whether to zero infinite losses and the associated gradients.

*@par Third-party framework compatibility
* Compatible with the PyTorch CTCLoss operator.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(CTCLossV2Grad)
.INPUT(grad_out, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(log_probs, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(targets, TensorType({DT_INT32, DT_INT64}))
.INPUT(input_lengths, TensorType({DT_INT32, DT_INT64}))
.INPUT(target_lengths, TensorType({DT_INT32, DT_INT64}))
.INPUT(neg_log_likelihood, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(log_alpha, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(grad, TensorType({DT_FLOAT, DT_DOUBLE}))
.ATTR(blank, Int, 0)
.ATTR(reduction, String, "mean")
.ATTR(zero_infinity, Bool, false)
.OP_END_FACTORY_REG(CTCLossV2Grad)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_CTC_OPS_H_
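A worked shape example for CTCLossV2 (the concrete sizes are illustrative; the exact log_alpha layout is implementation-defined):

// T = 50 (input length), N = 16 (batch), C = 20 (classes incl. blank), S = 30 (max target length)
//   log_probs:          (50, 16, 20)  float
//   targets:            (16, 30)      int32
//   input_lengths:      (16,)         int32
//   target_lengths:     (16,)         int32
//   neg_log_likelihood: per-sample loss, shape (16,), before "reduction" is applied
//   log_alpha:          per-sample trace probabilities (layout implementation-defined)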

+ 82
- 7
third_party/fwkacllib/inc/ops/data_flow_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -908,7 +908,7 @@ REG_OP(TensorArray)
.OUTPUT(handle, TensorType({DT_RESOURCE}))
.OUTPUT(flow, TensorType({DT_FLOAT}))
.REQUIRED_ATTR(dtype, Type)
.ATTR(element_shape, ListInt, ge::UNKNOWN_SHAPE)
.ATTR(element_shape, ListInt, ge::UNKNOWN_RANK)
.ATTR(dynamic_size, Bool, false)
.ATTR(clear_after_read, Bool, true)
.ATTR(identical_element_shapes, Bool, false)
@@ -963,7 +963,7 @@ REG_OP(TensorArrayConcat)
DT_QUINT8, DT_QINT32}))
.OUTPUT(lengths, TensorType({DT_INT64}))
.REQUIRED_ATTR(dtype, Type)
.ATTR(element_shape_except0, ListInt, ge::UNKNOWN_SHAPE)
.ATTR(element_shape_except0, ListInt, ge::UNKNOWN_RANK)
.OP_END_FACTORY_REG(TensorArrayConcat)

/**
@@ -999,7 +999,7 @@ REG_OP(TensorArrayGather)
DT_STRING, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8,
DT_QUINT8, DT_QINT32}))
.REQUIRED_ATTR(dtype, Type)
.ATTR(element_shape, ListInt, ge::UNKNOWN_SHAPE)
.ATTR(element_shape, ListInt, ge::UNKNOWN_RANK)
.OP_END_FACTORY_REG(TensorArrayGather)

/**
@@ -1430,6 +1430,24 @@ REG_OP(OrderedMapClear)
.ATTR(shared_name, String, "")
.OP_END_FACTORY_REG(OrderedMapClear)

/**
*@brief FakeQueue, supports the TensorFlow API FixedLengthRecordReader. \n

*@par Inputs:
*Including:
* @li resource: A Tensor of type DT_RESOURCE.

*@par Outputs:
*handle: A Tensor of type DT_STRING ref. \n

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator FakeQueue.
*/
REG_OP(FakeQueue)
.INPUT(resource, TensorType({DT_RESOURCE}))
.OUTPUT(handle, TensorType({DT_STRING}))
.OP_END_FACTORY_REG(FakeQueue)

/**
*@brief Returns the number of incomplete elements in the underlying container. \n

@@ -2258,6 +2276,7 @@ REG_OP(LruCache)
.ATTR(shared_name, String, "LruCache")
.ATTR(cache_size, Int, 100000)
.ATTR(load_factor, Float, 1)
.REQUIRED_ATTR(dtype, Type)
.OP_END_FACTORY_REG(LruCache)

/**
@@ -2277,9 +2296,9 @@ REG_OP(CacheAdd)
.INPUT(cache, TensorType({DT_RESOURCE}))
.INPUT(ids, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32}))
.OUTPUT(swap_in_id, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32}))
.OUTPUT(swap_in_idx, TensorType({DT_INT64}))
.OUTPUT(swap_in_idx, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32}))
.OUTPUT(swap_out_id, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32}))
.OUTPUT(swap_out_idx, TensorType({DT_INT64}))
.OUTPUT(swap_out_idx, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32}))
.OP_END_FACTORY_REG(CacheAdd)

/**
@@ -2295,9 +2314,65 @@ REG_OP(CacheAdd)
REG_OP(CacheRemoteIndexToLocal)
.INPUT(cache, TensorType({DT_RESOURCE}))
.INPUT(ids, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32}))
.OUTPUT(local_idx, TensorType({DT_INT64}))
.OUTPUT(local_idx, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32}))
.OP_END_FACTORY_REG(CacheRemoteIndexToLocal)

/**
*@brief CacheAllIndexToLocal, gets all ids in the cache
*@par Inputs:
*cache: resource data
*@par Outputs:
*local_idx: ids in the cache.
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(CacheAllIndexToLocal)
.INPUT(cache, TensorType({DT_RESOURCE}))
.OUTPUT(local_idx, TensorType({DT_INT64, DT_INT32, DT_UINT64, DT_UINT32}))
.REQUIRED_ATTR(dtype, Type)
.OP_END_FACTORY_REG(CacheAllIndexToLocal)

/**
*@brief DynamicGetNext, dynamic get next data
*@par Inputs:
*x: the iterator, all types are available
*@par Outputs:
*y: the data in the iterator, all types are available
*@par Attributes:
*output_types: types of all outputs
*output_shapes: shapes of all outputs
*_dynamic_graph_execute_mode: dynamic graph execution mode,
value is one of lazy_recompile and dynamic_execute
*_getnext_inputs_shape_range: shape ranges of outputs,
it works where _dynamic_graph_execute_mode is dynamic_execute
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DynamicGetNext)
.INPUT(x, TensorType::ALL())
.DYNAMIC_OUTPUT(y, TensorType::ALL())
.ATTR(output_types, ListType, {})
.ATTR(output_shapes, ListListInt, {{}, {}})
.ATTR(_dynamic_graph_execute_mode, String, "lazy_recompile")
.ATTR(_getnext_inputs_shape_range, String, "")
.OP_END_FACTORY_REG(DynamicGetNext)

/**
*@brief AdpGetNext
*@par Outputs:
*y: the data in iterator, all types are available
*@par Attributes:
*output_types: types of all outputs
*output_shapes: shapes of all outputs
*queue_name: cdqm queue name
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(AdpGetNext)
.DYNAMIC_OUTPUT(y, TensorType::ALL())
.ATTR(output_types, ListType, {})
.ATTR(output_shapes, ListListInt, {{}, {}})
.ATTR(queue_name, String, "")
.OP_END_FACTORY_REG(AdpGetNext)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_DATA_FLOW_OPS_H_
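Note that the LruCache hunk above turns dtype into a required attribute; a hedged construction sketch (assuming the ge::op wrapper generated from the registration, which is not defined in this header):

void BuildLruCacheSketch() {
  ge::op::LruCache cache("lru_cache");
  cache.set_attr_cache_size(100000);
  cache.set_attr_load_factor(1.0f);
  cache.set_attr_dtype(ge::DT_FLOAT);  // newly required; omitting it should now fail op verification
}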

+ 511
- 16
third_party/fwkacllib/inc/ops/elewise_calculation_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -28,10 +28,13 @@ namespace ge {

*@par Inputs:
*Dynamic inputs, including:
* @li x: A list of Tensor objects, each with same shape and type. The supported types are:
*x: A list of Tensor objects, each with same shape and type. The supported types are:
* float16, float32, double, int32, uint8, int16, int8, complex64, int64,
* qint8, quint8, qint32, uint16, complex128, uint32, uint64. It's a dynamic input. \n

*@par Attributes:
*N: A required attribute of type int32, specifying the number of inputs. \n

*@par Outputs:
*y: A Tensor. Has the same shape and type as the elements of "x". \n

@@ -122,7 +125,8 @@ REG_OP(MinimumGrad)
*@par Inputs:
*One input:
*x:A Tensor. Must be one of the following types: bool, float16, float, int8, int32, uint32, uint8,
int64, uint64, int16, uint16, double, complex64, complex128, qint8, quint8, qint16, quint16, qint32. \n
int64, uint64, int16, uint16, double, complex64, complex128, qint8, quint8, qint16, quint16, qint32.
For float32 type, the actual calculation on the chip is based on float16. \n

*@par Attributes:
*dst_type: An required attribute of type int32, specifying the dst data type. \n
@@ -142,6 +146,8 @@ REG_OP(Cast)

/**
*@brief Returns the truth value of (x1 >= x2) element-wise. \n
*When the input is int32 and (x2 - x1) > 2**31 or < -2**31,
*AI Core accuracy is not guaranteed. \n

*@par Inputs:
*Two inputs, including:
@@ -163,6 +169,8 @@ REG_OP(GreaterEqual)

/**
*@brief Returns the truth value of (x1 < x2) element-wise. \n
*When the input is int32 and (x2 - x1) > 2**31 or < -2**31,
*AI Core accuracy is not guaranteed. \n

*@par Inputs:
*Two inputs, including:
@@ -322,8 +330,8 @@ REG_OP(Sub)
*@brief computes the absolute value of a tensor. \n

*@par Inputs:
*One inputs, including:
* @li x: A Tensor. Must be one of the following types: float16, float32, double, int32, int64. \n
*One input, including: \n
*x: A Tensor. Must be one of the following types: float16, float32, double, int32, int64. \n

*@par Outputs:
*y: A Tensor. Has the same type as "x". \n
@@ -563,6 +571,8 @@ REG_OP(InvGrad)

/**
*@brief: Returns the truth value of (x <= y) element-wise. \n
*When the input is int32 and (x2 - x1) > 2**31 or < -2**31,
*AI Core accuracy is not guaranteed. \n

*@par Inputs:
* Two inputs, including:
@@ -611,6 +621,15 @@ REG_OP(Log1p)

*@par Outputs:
*y: A Tensor. Has the same type as "x1".

*@attention Constraints:
*@li x2: The input data must not contain 0.
*@li When the element count exceeds 2048, the operator cannot guarantee an
*accuracy within two-thousandths on the Mini form.
*@li Due to architecture differences, the results of this operator on the NPU
*and the CPU may be inconsistent.
*@li If the shape is expressed as (D1, D2, ..., Dn), then D1*D2*...*Dn <= 1000000 and n <= 8.

*@par Third-party framework compatibility
*Compatible with the TensorFlow operator Mod.
*/
@@ -1020,7 +1039,7 @@ REG_OP(BesselI1e)
* y = log_base(shift + scale * x), with "base" > 0. \n

* @par Inputs:
* @li x: A Tensor of type complex64, complex128, float16, float32 or double. \n
* x: A Tensor of type complex64, complex128, float16, float32 or double. \n

* @par Attributes:
* @li base: An optional float32, specifying the base "e". Defaults to "-1.0"
@@ -1065,7 +1084,7 @@ REG_OP(Log)
* uint8, int8, uint16, int16, int32, int64, complex64, complex128. \n

* @attention Constraints:
* @li "x1" and "x2" have incompatible shapes or types. \n
* "x1" and "x2" have incompatible shapes or types. \n

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator Multiply.
@@ -1451,6 +1470,8 @@ REG_OP(ReciprocalGrad)

/**
*@brief Returns the truth value of (x1 > x2) element-wise. \n
*When the input is int32 and (x2 - x1) > 2**31 or < -2**31,
*AI Core accuracy is not guaranteed. \n

*@par Inputs:
*@li x1: A Tensor of type float16, float32, double, int64, int32, int16, int8,
@@ -2042,6 +2063,15 @@ REG_OP(FloorDiv)
*
*@par Outputs:
*y: Result remainder.

*@attention Constraints:
*@li x2: The input data must not contain 0.
*@li When the element count exceeds 2048, the operator cannot guarantee an
*accuracy within two-thousandths on the Mini form.
*@li Due to architecture differences, the results of this operator on the NPU
*and the CPU may be inconsistent.
*@li If the shape is expressed as (D1, D2, ..., Dn), then D1*D2*...*Dn <= 1000000 and n <= 8.

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator FloorMod.
*/
@@ -2168,6 +2198,14 @@ REG_OP(Tan)
*@par Outputs:
*y: A Tensor. Has the same type as "x1". \n

*@attention Constraints:
*@li x2: The input data must not contain 0.
*@li When the element count exceeds 2048, the operator cannot guarantee an
*accuracy within two-thousandths on the Mini form.
*@li Due to architecture differences, the results of this operator on the NPU
*and the CPU may be inconsistent.
*@li If the shape is expressed as (D1, D2, ..., Dn), then D1*D2*...*Dn <= 1000000 and n <= 8.

*@par Third-party framework compatibility
*@li Compatible with the TensorFlow operator TruncateMod.
*/
@@ -2424,6 +2462,25 @@ REG_OP(Eltwise)
.ATTR(coeff, ListFloat, {})
.OP_END_FACTORY_REG(Eltwise)

/**
*@brief Computes the inverse error function of each element of input. \n

*@par Inputs:
*One input, including:
* @li input_x: A tensor. Must be one of the following types:
* float16, float32. \n

*@par Outputs:
*output_y: A Tensor with the same type and shape as "input_x". \n

*@par Third-party framework compatibility
*Compatible with the PyTorch operator Erfinv. \n
*/
REG_OP(Erfinv)
.INPUT(input_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(output_y, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(Erfinv)

/**
*@brief Computes element-wise population count. \n

@@ -2829,9 +2886,9 @@ REG_OP(AdamApplyOneAssign)
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(LambApplyOptimizerAssign)
.INPUT(input0, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(grad, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(inputv, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(inputm, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(mul0_x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(mul1_x, TensorType({DT_FLOAT16,DT_FLOAT}))
@@ -2842,6 +2899,8 @@ REG_OP(LambApplyOptimizerAssign)
.INPUT(do_use_weight, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(weight_decay_rate, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(output0, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(inputv, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(inputm, TensorType({DT_FLOAT16,DT_FLOAT}))
.OP_END_FACTORY_REG(LambApplyOptimizerAssign)

/**
@@ -2873,7 +2932,8 @@ REG_OP(LambApplyWeightAssign)
.INPUT(input1, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(input2, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(input3, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(input4, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(input_param, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(input_param, TensorType({DT_FLOAT16,DT_FLOAT}))
.OP_END_FACTORY_REG(LambApplyWeightAssign)

/**
@@ -3183,12 +3243,14 @@ REG_OP(Fills)
*@brief Add tensor with scale. \n

*@par Inputs:
*Five inputs, including:
* @li x1: A Tensor. Must be one of the following types:int32,int16, float16, float32.
* @li x2: A scale. Must be float. \n
*One input, including: \n
*x: A Tensor. Must be one of the following types: int32, int16, float16, float32. \n

*@par Attributes:
*value: A scale. Must be float. \n

*@par Outputs:
*@li y: A Tensor. Has the same type and shape as "x1". \n
*y: A Tensor. Has the same type and shape as "x". \n

*@par Third-party framework compatibility:
* Compatible with the Pytorch operator adds.
@@ -3329,8 +3391,441 @@ REG_OP(TensorRedirect)
.OUTPUT(output_x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8,
DT_INT64, DT_INT16, DT_UINT16, DT_UINT64, DT_UINT32}))
.OP_END_FACTORY_REG(TensorRedirect)
} // namespace ge

/**
* @brief Performs the element-wise division of tensor x1 by tensor x2,
* multiplies the result by the scalar value, and adds it to tensor input_data

* @par Inputs:
* Four inputs, including:
* @li input_data: A mutable input Tensor. Must be one of the following types:
* float16, float32.
* @li x1: A mutable input Tensor of the same type as input_data.
* @li x2: A mutable input Tensor of the same type as input_data.
* @li value: A mutable input Tensor. Must be one of the following types:
* float16, float32, int32. \n

* @par Outputs:
* @li y: A mutable Tensor. Has the same type as "input_data". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator Addcdiv.
*/
REG_OP(Addcdiv)
.INPUT(input_data, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(value, TensorType({ DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(Addcdiv)

/**
* @brief Performs the element-wise multiplication of tensor x1 by tensor x2,
* multiplies the result by the scalar value, and adds it to tensor input_data


* @par Inputs:
* Four inputs, including:
* @li input_data: A mutable input Tensor. Must be one of the following types:
* float16, float32, int8, int32, uint8.
* @li x1: A mutable input Tensor of the same type as input_data.
* @li x2: A mutable input Tensor of the same type as input_data.
* @li value: A tensor which includes only one element of the same type as input_data. \n

* @par Outputs:
* @li y: A mutable output Tensor. Has the same type as "input_data". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator Addcmul.
*/
REG_OP(Addcmul)
.INPUT(input_data, TensorType({ DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8 }))
.INPUT(x1, TensorType({ DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8 }))
.INPUT(x2, TensorType({ DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8 }))
.INPUT(value, TensorType({ DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8 }))
.OUTPUT(y, TensorType({ DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8 }))
.OP_END_FACTORY_REG(Addcmul)

/**
* @brief Computes the result of x2 * alpha + x1.

* @par Inputs:
* @li x1: An ND tensor of type float16, float32, int32.
* @li x2: An ND tensor of type float16, float32, int32.
* @li alpha: A scalar tensor of type float16, float32. \n

* @par Outputs:
* @li y: An ND tensor with the same shape and type as "x1". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator Axpy.
*/
REG_OP(AxpyV2)
.INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(alpha, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OP_END_FACTORY_REG(AxpyV2)
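Reference semantics of AxpyV2 as a scalar loop (a host-side sketch, not the device kernel):

#include <cstddef>
#include <vector>

std::vector<float> AxpyV2Ref(const std::vector<float> &x1, const std::vector<float> &x2, float alpha) {
  std::vector<float> y(x1.size());
  for (std::size_t i = 0; i < x1.size(); ++i) {
    y[i] = x2[i] * alpha + x1[i];  // y = x2 * alpha + x1
  }
  return y;
}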

/**
* @brief Computes the result of x1 - x2.

* @par Inputs:
* @li x1: An ND tensor of type float16, float, int32.
* @li x2: An ND tensor of type float16, float, int32. \n

* @par Outputs:
* @li y: An ND tensor with the same type as "x1". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator Sub.
*/
REG_OP(PtSub)
.INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OP_END_FACTORY_REG(PtSub)

/**
* @brief Add the partial values of two tensors in format NC1HWC0.

* @par Inputs:
* @li x1: A Tensor in 5HD, and must be one of the following types: float16,
* float32. \n
* @li x2: A Tensor of the same type as "x1", and the same shape as "x1",
* except for the C1 value. \n

* @par Attributes:
* @li x1_c1_offset: A required int. Offset value of C1 in "x1". \n
* @li x2_c1_offset: A required int. Offset value of C1 in "x2". \n
* @li c1_len: A required int. C1 len of "y". The value must be less than
* the difference between C1 and offset in "x1" and "x2". \n

* @par Outputs:
* @li y: A Tensor of the same type as "x1", and the same shape as "x1",
* except for the C1 value. Record the result after adding. \n
*/
REG_OP(StrideAdd)
.INPUT(x1, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.INPUT(x2, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.OUTPUT(y, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.REQUIRED_ATTR(x1_c1_offset, Int)
.REQUIRED_ATTR(x2_c1_offset, Int)
.REQUIRED_ATTR(c1_len, Int)
.OP_END_FACTORY_REG(StrideAdd)

/**
* @brief Compares whether two tensors are exactly equal, outputting a single bool value.

* @par Inputs:
* Two inputs, including:
* @li input_x: A Tensor. The first tensor. \n
* @li input_y: A Tensor. The second tensor. \n

* @par Outputs:
* @li output_z: A bool Tensor holding the comparison result of the two inputs. \n

* @par Third-party framework compatibility
* Compatible with the PyTorch equal operator. \n
*/
REG_OP(TensorEqual)
.INPUT(input_x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8}))
.INPUT(input_y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8}))
.OUTPUT(output_z, TensorType({DT_BOOL}))
.OP_END_FACTORY_REG(TensorEqual)

/**
* @brief Element-wise max of each of the input tensors (with Numpy-style broadcasting support).
* All inputs and outputs must have the same data type. This operator supports multidirectional
* (i.e., Numpy-style) broadcasting.
*
* @par Inputs:
* one dynamic input, including:
* @li x: A dynamic input Tensor. Must be one of the following types: float32, float16, double, int32, int64
*
* @par Outputs:
* one output, including:
* @li y: A Tensor of the same type as x
*
*/
REG_OP(MaxN)
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64, DT_INT32, DT_INT64}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64, DT_INT32, DT_INT64}))
.OP_END_FACTORY_REG(MaxN)


/**
* @brief Calculates x * mask * value.
*
* @par Inputs:
* @li x: A tensor of type float16 or float32, specifying the input to the data layer.
* @li mask: A tensor of type int8, float16 or float32, with the same shape as x. \n
*
* @par Attributes:
* value: A required float. \n
*
* @par Outputs:
* y: A Tensor of the same type and shape as x.
*
*/
REG_OP(MaskedScale)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32}))
.INPUT(mask, TensorType({DT_INT8, DT_FLOAT16, DT_FLOAT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32}))
.REQUIRED_ATTR(value, Float)
.OP_END_FACTORY_REG(MaskedScale)

/**
* @brief Calculate the lerp function. \n

* @par Inputs:
* Three inputs, including:
* @li start: A tensor. Must be one of the following types:
* float16, float32. \n
* @li end: A tensor. Must be one of the following types:
* float16, float32. \n
* @li weight: A tensor. Must be one of the following types:
* float16, float32. \n

* @par Outputs:
* y: A Tensor with the same type and shape as "start". \n

* @par Third-party framework compatibility
* Compatible with the PyTorch operator Lerp. \n
*/
REG_OP(Lerp)
.INPUT(start, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(end, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(weight, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(Lerp)

/**
*@brief Returns the number of elements satisfying abs(x1 - x2) > atol + rtol * abs(x2), element-wise. \n

*
*@par Inputs:
*@li x1: A tensor. Must be one of the following types: float32, int32, uint8, int8, float16
*@li x2: A tensor of the same type as "x1".
*
*@par Attributes:
* atol: Defaults to "1e-05".
* rtol: Defaults to "1e-03".
*
*@par Outputs:
* num: A tensor of type float32.
*
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*
*/
REG_OP(DataCompare)
.INPUT(x1, TensorType({ DT_FLOAT16, DT_FLOAT,DT_INT8, DT_UINT8, DT_INT32 }))
.INPUT(x2, TensorType({ DT_FLOAT16, DT_FLOAT,DT_INT8, DT_UINT8, DT_INT32 }))
.OUTPUT(num, TensorType({DT_FLOAT}))
.ATTR(atol, Float, 1e-5)
.ATTR(rtol, Float, 1e-3)
.OP_END_FACTORY_REG(DataCompare)

/**
*@brief Hardmax(element in input, axis) = 1 if the element is the first maximum value along the specified axis, 0
*otherwise. The input does not need to be a 2D vector explicitly. The "axis" attribute indicates the dimension along
*which Hardmax will be performed. The output tensor has the same shape and contains the Hardmax values of the
*corresponding input.
*
*@par Inputs:
*one input, including:
*@li x: An input Tensor. Must be one of the following types: float32, float16
*
*@par Attributes:
*@li axis: A required int attribute that decides which dimension will be used to compute the hardmax
*
*@par Outputs:
*one output, including:
*@li y: A Tensor of the same type as x
*
*/
REG_OP(HardMax)
.INPUT(x, TensorType({ DT_FLOAT16, DT_FLOAT }))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(axis, Int, -1)
.OP_END_FACTORY_REG(HardMax)

/**
* @brief Computes the dot product (inner product) of two tensors. This function does not broadcast.

* @par Inputs:
* Two inputs, including:
* @li input_x: A Tensor. the first tensor must be 1d. \n
* @li input_y: A Tensor. the second tensor must be 1d. \n

* @par Outputs:
* @li output: A Tensor. Result of the two inputs, must be 1d. \n

* @par Third-party framework compatibility
* Compatible with the Pytorch dot operator. \n
*/
REG_OP(Dot)
.INPUT(input_x, TensorType({DT_FLOAT, DT_FLOAT16, DT_UINT8, DT_INT8, DT_INT32}))
.INPUT(input_y, TensorType({DT_FLOAT, DT_FLOAT16, DT_UINT8, DT_INT8, DT_INT32}))
.OUTPUT(output, TensorType({DT_FLOAT, DT_FLOAT16, DT_UINT8, DT_INT8, DT_INT32}))
.OP_END_FACTORY_REG(Dot)
/**
*@brief Returns a new tensor with boolean elements representing
*whether each element of input is "close" to the corresponding element of other. \n

*@par Inputs:
*Two inputs, including:
* @li x1: A tensor. Must be one of the following types:
* float16, float32, int32. \n
* @li x2: A tensor with the same type and shape as x1. \n

*@par Attributes:
*@li rtol: An optional float. Defaults to 1e-05. \n
*@li atol: An optional float. Defaults to 1e-08. \n
*@li equal_nan: An optional bool. Defaults to false. \n

*@par Outputs:
*y: A bool Tensor with the same shape as x1. \n

*@par Third-party framework compatibility
*Compatible with the PyTorch operator isclose. \n
*/
REG_OP(IsClose)
.INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_BOOL}))
.ATTR(rtol, Float, 1e-05)
.ATTR(atol, Float, 1e-08)
.ATTR(equal_nan, Bool, false)
.OP_END_FACTORY_REG(IsClose)
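Reference semantics of IsClose as a scalar predicate (a host-side sketch mirroring the rtol/atol formula above):

#include <cmath>

bool IsCloseRef(float x1, float x2, float rtol = 1e-05f, float atol = 1e-08f, bool equal_nan = false) {
  if (std::isnan(x1) || std::isnan(x2)) {
    return equal_nan && std::isnan(x1) && std::isnan(x2);
  }
  return std::fabs(x1 - x2) <= atol + rtol * std::fabs(x2);
}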

/**
* @brief Returns the reverse tensor of the ArgMax operator of a tensor. \n

* @par Inputs:
* Three inputs, including:
* var: A Tensor of type float16, float32, int32 or int8. \n
* indices: A Tensor of type int32. \n
* updates: A Tensor of type float16, float32, int32 or int8. \n

* @par Attributes:
* @li dimension: An integer of type int, specifying the axis information of the index with the maximum value.\n

* @par Outputs:
* y: A Tensor of type float16, float32, int32 or int8. \n
*
*@attention Constraints:
*@li indices: only supports int32, and its shape must be the same as "updates"
*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x".
*@li y: A Tensor whose type and shape are the same as "var" \n

*@par Third-party framework compatibility
* Does not support all scenarios of the PyTorch operator scatter.
* For example:
* var.shape=[2,3,4,5], dim=2: the shape of indices and updates should be [2,3,5];
* the shape [2,3,2,5] for indices and updates, as in the PyTorch operator scatter, is not supported. \n
*/
REG_OP(ArgMaxGrad)
.INPUT(var, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(updates, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8}))
.REQUIRED_ATTR(dimension, Int)
.OP_END_FACTORY_REG(ArgMaxGrad)

/**
* @brief Returns the reverse tensor of the ArgMax operator of a tensor. \n

* @par Inputs:
* Four inputs, including:
* var: A Tensor of type float16, float32, int32 or int8. \n
* indices: A Tensor of type int32. \n
* updates: A Tensor of type float16, float32, int32 or int8. \n
* assist: A Tensor of type int32; an assist matrix whose shape must match the shape of var \n

* @par Attributes:
* @li dimension: An integer of type int, specifying the axis information of the index with the maximum value.\n

* @par Outputs:
* y: A Tensor of type float16, float32, int32 or int8. \n

*@attention Constraints:
*@li indices: only supports int32, and its shape must be the same as "updates"
*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x".
*@li y: A Tensor whose type and shape are the same as "var" \n

*@par Third-party framework compatibility
* Does not support all scenarios of the PyTorch operator scatter.
* For example:
* var.shape=[2,3,4,5], dim=2: the shape of indices and updates should be [2,3,5];
* the shape [2,3,2,5] for indices and updates, as in the PyTorch operator scatter, is not supported. \n
*/
REG_OP(ArgMaxGradD)
.INPUT(var, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(updates, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8}))
.INPUT(assist, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8}))
.REQUIRED_ATTR(dimension, Int)
.OP_END_FACTORY_REG(ArgMaxGradD)

/**
*@brief Calculates the output of the function "AddMatMatElements":
* c = c * beta + alpha * a * b

*@par Inputs:
*Five inputs, including:
* @li c: A mutable Tensor. Must be one of the following types:
* float16, float32.
* @li a: A mutable Tensor of the same type as "c".
* @li b: A mutable Tensor of the same type as "c".
* @li beta: A mutable scalar of the same type as "c".
* @li alpha: A mutable scalar of the same type as "c". \n

*@par Outputs:
* @li c: A mutable Tensor. Has the same type as "c". \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator AddMatMatElements.
*/
REG_OP(AddMatMatElements)
.INPUT(c, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(a, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(b, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(alpha, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(c, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(AddMatMatElements)

/**
*@brief Returns the cosine similarity between x1 and x2, computed along dim. \n

*@par Inputs:
*Two inputs, including:
* @li input_x1: A tensor. Must be of the following type:
* float32. \n
* @li input_x2: A tensor. Must be of the following type:
* float32. \n

*@par Outputs:
*@li output_y: A Tensor with the same type as the inputs. \n

*@par Third-party framework compatibility
*Compatible with the PyTorch operator CosineSimilarity. \n
*/
REG_OP(CosineSimilarity)
.INPUT(input_x1, TensorType({DT_FLOAT})) /* "First operand." */
.INPUT(input_x2, TensorType({DT_FLOAT})) /* "Second operand." */
.OUTPUT(output_y, TensorType({DT_FLOAT})) /* "Result, has same element type as two inputs" */
.ATTR(dim, Int, 1)
.ATTR(eps, Float, 1e-8)
.OP_END_FACTORY_REG(CosineSimilarity)

} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_ELEWISE_CALCULATION_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/functional_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/get_data_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 49
- 0
third_party/fwkacllib/inc/ops/globalavgpool.h View File

@@ -0,0 +1,49 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
* \file globalavgpool.h
* \brief
*/
#ifndef OPS_BUILT_IN_OP_PROTO_INC_GLOBALAVERAGEPOOL_H_
#define OPS_BUILT_IN_OP_PROTO_INC_GLOBALAVERAGEPOOL_H_
#include "graph/operator_reg.h"
namespace ge {
/**
*@brief GlobalAveragePool consumes an input tensor X and applies average pooling across the values in the same channel.
This is equivalent to AveragePool with kernel size equal to the spatial dimension of the input tensor. \n
*@par Inputs:
*@li x: Input data tensor from the previous operator; dimensions for image case are (N x C x H x W),
where N is the batch size, C is the number of channels, and H and W are the height and the width of the data.
For non image case, the dimensions are in the form of (N x C x D1 x D2 ... Dn), where N is the batch size.
*@par Outputs:
*y: Output data tensor from pooling across the input tensor. The output tensor has the same rank as the input.
The first two dimensions of output shape are the same as the input (N x C), while the other dimensions are all 1
*@par Restrictions:
*Warning: This operator can be integrated only by configuring INSERT_OP_FILE of aclgrphBuildModel. Please do not use it directly.
*/
REG_OP(GlobalAveragePool)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(GlobalAveragePool)
} // namespace ge
#endif  // OPS_BUILT_IN_OP_PROTO_INC_GLOBALAVERAGEPOOL_H_
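Shape behavior of GlobalAveragePool, for reference (illustrative):

// Image case:   x (N, C, H, W)            -> y (N, C, 1, 1)
// General case: x (N, C, D1, D2, ..., Dn) -> y (N, C, 1, 1, ..., 1)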

+ 121
- 14
third_party/fwkacllib/inc/ops/hcom_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -45,8 +45,6 @@ REG_OP(HcomAllGather)
.OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64}))
.REQUIRED_ATTR(rank_size, Int)
.REQUIRED_ATTR(group, String)
.ATTR(alpha, Float, 1.0)
.ATTR(beta, Float, 0.0)
.OP_END_FACTORY_REG(HcomAllGather)

/**
@@ -77,8 +75,6 @@ REG_OP(HcomAllReduce)
.REQUIRED_ATTR(group, String)
.ATTR(fusion, Int, 1)
.ATTR(fusion_id, Int, -1)
.ATTR(alpha, Float, 1.0)
.ATTR(beta, Float, 0.0)
.OP_END_FACTORY_REG(HcomAllReduce)

/**
@@ -91,7 +87,7 @@ REG_OP(HcomAllReduce)
input of this rank will be broadcast to other ranks.
* @li fusion: A required integer identifying if the op need to fusion,the
default value is none fusion
* @li fusion: A required integer identifying the fusion id if para fusion
* @li fusion_id: A required integer identifying the fusion id if para fusion
is set.
* @li group: A required string identifying the group name of ranks
participating in the op.
@@ -109,10 +105,39 @@ REG_OP(HcomBroadcast)
.REQUIRED_ATTR(group, String)
.ATTR(fusion, Int, 0)
.ATTR(fusion_id, Int, -1)
.ATTR(alpha, Float, 1.0)
.ATTR(beta, Float, 0.0)
.OP_END_FACTORY_REG(HcomBroadcast)

/**
* @brief Performs reduction from other ranks to the root rank.
* @par Inputs:
* x: A tensor. Must be one of the following types: int8, int16, int32, float16,
float32.
* @par Attributes:
* @li root_rank: A required integer identifying the root rank in the op;
the reduction result will be on this root rank.
* @li reduction: A required string identifying the reduction operation to
perform. The supported operations are: "sum", "max", "min", "prod".
* @li group: A required string identifying the group name of ranks
participating in the op.
* @li fusion: An optional integer identifying the fusion flag of the op.
0 (default): no fusion; 1: fusion; 2: fuse the ops by fusion id.
* @li fusion_id: An optional integer identifying the fusion id of the op.
* The HcomReduce ops with the same fusion id will be fused.
* @par Outputs:
* y: A Tensor. Has the same type as "x".
* @attention Constraints:
*"group" is limited to 128 characters. Use "hccl_world_group"
as the name of a world group.
*/
REG_OP(HcomReduce)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16}))
.REQUIRED_ATTR(root_rank, Int)
.REQUIRED_ATTR(reduction, String)
.REQUIRED_ATTR(group, String)
.ATTR(fusion, Int, 0)
.ATTR(fusion_id, Int, -1)
.OP_END_FACTORY_REG(HcomReduce)
/**
* @brief Performs reduction across all input tensors, scattering in equal
blocks among ranks, each rank getting a chunk of data based on its rank
@@ -139,8 +164,6 @@ REG_OP(HcomReduceScatter)
.REQUIRED_ATTR(reduction, String)
.REQUIRED_ATTR(group, String)
.REQUIRED_ATTR(rank_size, Int)
.ATTR(alpha, Float, 1.0)
.ATTR(beta, Float, 0.0)
.OP_END_FACTORY_REG(HcomReduceScatter)

/**
@@ -167,8 +190,6 @@ REG_OP(HcomSend)
.REQUIRED_ATTR(group, String)
.REQUIRED_ATTR(sr_tag, Int)
.REQUIRED_ATTR(dest_rank, Int)
.ATTR(alpha, Float, 1.0)
.ATTR(beta, Float, 0.0)
.OP_END_FACTORY_REG(HcomSend)

/**
@@ -202,8 +223,6 @@ REG_OP(HcomReceive)
.REQUIRED_ATTR(src_rank, Int)
.REQUIRED_ATTR(shape, ListInt)
.REQUIRED_ATTR(dtype, Type)
.ATTR(alpha, Float, 1.0)
.ATTR(beta, Float, 0.0)
.OP_END_FACTORY_REG(HcomReceive)

/**
@@ -219,6 +238,15 @@ REG_OP(HcomRemoteRead)
.REQUIRED_ATTR(dtype, Type)
.OP_END_FACTORY_REG(HcomRemoteRead)

/**
* @brief Performs Remote Ref Read of input tensors
* @par Inputs:
* remote: A tensor describing the remote memory address to read: u64 remoteId, u64 addrRemote, u64 length
* cache_var: The local base address
* local_offset: Skip step length
* @par Outputs:
* cache_var: The local base address
*/
REG_OP(HcomRemoteRefRead)
.INPUT(remote, TensorType({DT_UINT64}))
.INPUT(cache_var, TensorType({DT_UINT64}))
@@ -239,11 +267,90 @@ REG_OP(HcomRemoteWrite)
.INPUT(local, TensorType::ALL())
.OP_END_FACTORY_REG(HcomRemoteWrite)

/**
* @brief Performs a remote scatter write of the input tensors.
* @par Inputs:
* @li remote: A tensor describing the remote memory to write, one descriptor
per entry: u64 remoteId, u64 addrRemote, u64 length.
* @li local: A Tensor whose element count is length / size_of(Type).
* @li local_offset: An optional tensor of offsets into the local buffer.
*/
REG_OP(HcomRemoteScatterWrite)
.INPUT(remote, TensorType({DT_INT64, DT_UINT64}))
.INPUT(local, TensorType::ALL())
.OPTIONAL_INPUT(local_offset, TensorType({DT_UINT64}))
.OP_END_FACTORY_REG(HcomRemoteScatterWrite)
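
The remote descriptors consumed by the remote read/write ops above are triples of u64 values; a plain-struct view of that documented layout (the struct name is illustrative):

#include <cstdint>

// One entry of the "remote" tensor, as documented above:
// three consecutive u64 fields per descriptor.
struct RemoteDesc {
  uint64_t remote_id;    // u64 remoteId
  uint64_t addr_remote;  // u64 addrRemote: remote memory address
  uint64_t length;       // u64 length: bytes to transfer
};
static_assert(sizeof(RemoteDesc) == 3 * sizeof(uint64_t),
              "descriptor is a packed u64 triple");
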

/**
* @brief All ranks send different amounts of data to, and receive different
amounts of data from, all ranks.
* @par Inputs:
* Five inputs, including:
* @li send_data: A tensor. the memory to send.
* @li send_counts: A list, where entry i specifies the number of elements in
send_data to send to rank i.
* @li send_displacements: A list, where entry i specifies the displacement
(offset from send_data) from which to send data to rank i.
* @li recv_counts: A list, where entry i specifies the number of
elements to receive from rank i.
* @li recv_displacements: A list, where entry i specifies the displacement
(offset from recv_data) to which data from rank i should be written.
* @par Outputs:
* recv_data: A Tensor that has the same element type as send_data.
* @par Attributes:
* @li group: A string identifying the group name of ranks participating in
the op.
* @attention All ranks participating in the op should be connected in a
full mesh over RDMA.
*/
REG_OP(HcomAllToAllV)
.INPUT(send_data, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64}))
.INPUT(send_counts, TensorType({DT_INT64}))
.INPUT(send_displacements, TensorType({DT_INT64}))
.INPUT(recv_counts, TensorType({DT_INT64}))
.INPUT(recv_displacements, TensorType({DT_INT64}))
.OUTPUT(recv_data, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64}))
.REQUIRED_ATTR(group, String)
.OP_END_FACTORY_REG(HcomAllToAllV)
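
When the send and receive buffers are densely packed, each displacements input is simply the exclusive prefix sum of the matching counts; a minimal sketch (the helper name is hypothetical):

#include <cstdint>
#include <vector>

// Dense packing: entry i of the displacement vector is the exclusive
// prefix sum of counts[0..i-1], matching the send/recv layout above.
std::vector<int64_t> DenseDisplacements(const std::vector<int64_t> &counts) {
  std::vector<int64_t> disp(counts.size(), 0);
  for (size_t i = 1; i < counts.size(); ++i) {
    disp[i] = disp[i - 1] + counts[i - 1];
  }
  return disp;
}
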

/**
* @brief All ranks send different amounts of data to, and receive different
amounts of data from, all ranks. All data described by addrinfo is also
concatenated together into the output gathered.
* @par Inputs:
* Four inputs, including:
* @li addrinfo: A tensor describing the memory info (address, length) to send.
* @li addrinfo_count_per_rank: A list, where entry i specifies the number of
elements in send_data to send to rank i.
* @li recv_counts: A list, where entry i specifies the number of
elements to receive from rank i.
* @li recv_displacements: A list, where entry i specifies the displacement
(offset from recv_data) to which data from rank i should be written.
* @par Outputs:
* Two outputs, including:
* @li recv_data: A Tensor whose element type is given by dtype.
* @li gathered: A Tensor whose element type is given by dtype.
* @par Attributes:
* @li group: A string identifying the group name of ranks participating in
the op.
* @li dtype: Datatype of send buffer elements.
* @li addr_length: Describes the element memory length in the addrinfo.
-2: all element memory lengths in the addrinfo are the same, but unknown.
-1: all element memory lengths are unknown.
>0: all element memory lengths in the addrinfo are the same, and the attribute
value is that memory length.
* @attention All ranks participating in the op should be connected in a
full mesh over RDMA.
*/
REG_OP(HcomGatherAllToAllV)
.INPUT(addrinfo, TensorType({DT_UINT64}))
.INPUT(addrinfo_count_per_rank, TensorType({DT_INT64}))
.INPUT(recv_counts, TensorType({DT_INT64}))
.INPUT(recv_displacements, TensorType({DT_INT64}))
.OUTPUT(recv_data, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64}))
.OUTPUT(gathered, TensorType({DT_FLOAT, DT_INT32, DT_INT8, DT_INT16, DT_FLOAT16, DT_INT64, DT_UINT64}))
.REQUIRED_ATTR(group, String)
.REQUIRED_ATTR(dtype, Type)
.REQUIRED_ATTR(addr_length, Int)
.OP_END_FACTORY_REG(HcomGatherAllToAllV)
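
A small helper can make the addr_length convention above concrete (the function name is illustrative):

#include <cstdint>

// Interprets the addr_length attribute documented above. Returns true and
// sets *bytes_out when every addrinfo element has the same known length;
// returns false for -1 (varying) and -2 (uniform but unknown).
bool UniformAddrLength(int64_t addr_length, int64_t *bytes_out) {
  if (addr_length > 0) {
    *bytes_out = addr_length;
    return true;
  }
  return false;
}
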

} // namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_HCOM_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/hvd_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 623
- 30
third_party/fwkacllib/inc/ops/image_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -24,6 +24,22 @@
#include "graph/operator_reg.h"

namespace ge {
/**
*@brief Decode the frame(s) of a GIF-encoded image to a uint8 tensor . \n

*@par Inputs:
*@li contents:A Tensor of type string. 0-D. The GIF-encoded image. \n

*@par Outputs:
*image:A Tensor of type uint8. \n

*@par Third-party framework compatibility
*Compatible with tensorflow DecodeGif operator.
*/
REG_OP(DecodeGif)
.INPUT(contents, TensorType({DT_STRING}))
.OUTPUT(image, TensorType({DT_UINT8}))
.OP_END_FACTORY_REG(DecodeGif)

/**
*@brief Adjust the hue of one or more images . \n
@@ -31,11 +47,12 @@ namespace ge {
*@par Inputs:
*Input images is a tensor of at least 3 dimensions. The last dimension is
interpreted as channels, and must be three. Inputs include:
*@li images:A Tensor of type float. Images to adjust. At least 3-D.
*@li images:A Tensor of type float. Images to adjust. At least 3-D. The format
must be NHWC.
*@li delta:A Tensor of type float. A float delta to add to the hue . \n

*@par Outputs:
*y:A Tensor of type float . \n
*y:A Tensor of type float. The format must be NHWC. \n

*@attention Constraints:
*Input images is a tensor of at least 3 dimensions. The last dimension is
@@ -57,11 +74,12 @@ REG_OP(AdjustHue)
*@par Inputs:
*Input images is a tensor of at least 3 dimensions. The last dimension is
interpreted as channels, and must be three. Inputs include:
*@li images:A Tensor of type float. Images to adjust. At least 3-D.
*@li images:A Tensor of type float. Images to adjust. At least 3-D. The format
must be NHWC.
*@li scale:A Tensor of type float. A float scale to add to the saturation . \n

*@par Outputs:
*y:A Tensor of type float . \n
*y:A Tensor of type float. The format must be NHWC. \n

*@attention Constraints:
*Input images is a tensor of at least 3 dimensions. The last dimension is
@@ -83,11 +101,12 @@ REG_OP(AdjustSaturation)
*@par Inputs:
*Input images is a tensor of at least 3 dimensions. The last 3 dimensions are
interpreted as '[height, width, channels]'. Inputs include:
*@li images:A Tensor of type float. Images to adjust. At least 3-D.
*@li images:A Tensor of type float. Images to adjust. At least 3-D. The format
must be NHWC.
*@li scale:A Tensor of type float. A float multiplier for adjusting contrast . \n

*@par Outputs:
*y:A Tensor of type float . \n
*y:A Tensor of type float. The format must be NHWC. \n

*@attention Constraints:
*Input images is a tensor of at least 3 dimensions. The last dimension is
@@ -112,7 +131,7 @@ nearest neighbor sampling to a common output size specified by crop_size . \n
*Input images must be a 4-D tensor. Inputs include:
*@li images:A Tensor. Must be one of the following types:uint8, uint16, int8,
int16, int32, int64, float16, float, double. A 4-D tensor of shape
[batch, image_height, image_width, depth].
[batch, image_height, image_width, depth]. The format must be NHWC.
*@li boxes: A Tensor of type float. A 2-D tensor of shape [num_boxes, 4].
*@li box_index: A Tensor of type int32. A 1-D tensor of shape [num_boxes] with
int32 values in [0, batch).
@@ -127,7 +146,7 @@ extrapolation, when applicable.
NearestNeighbor . \n

*@par Outputs:
*y:A Tensor of type float . \n
*y:A Tensor of type float. The format must be NHWC. \n

*@attention Constraints:
*Input images must be a 4-D tensor . \n
@@ -193,7 +212,9 @@ boxes tensor . \n
*@par Inputs:
*Input images and grads must be a 4-D tensor. Inputs include:
*@li grads: A 4-D tensor of shape [num_boxes, crop_height, crop_width, depth].
The format must be NHWC.
*@li images: A 4-D tensor of shape [batch, image_height, image_width, depth].
The format must be NHWC.
Both image_height and image_width need to be positive.
*@li boxes: A 2-D tensor of shape [num_boxes, 4]. The i-th row of the tensor
specifies the coordinates of a box in the box_ind[i] image and is specified in
@@ -233,6 +254,7 @@ images tensor . \n
*@par Inputs:
*Input grads must be a 4-D tensor. Inputs include:
*@li grads: A 4-D tensor of shape [num_boxes, crop_height, crop_width, depth].
The format must be NHWC.
*@li boxes: A 2-D tensor of shape [num_boxes, 4]. The i-th row of the tensor
specifies the coordinates of a box in the box_ind[i] image and is specified
in normalized coordinates [y1, x1, y2, x2].
@@ -248,7 +270,8 @@ method: A string specifying the interpolation method. Only 'bilinear' is
supported for now . \n

*@par Outputs:
*y:A 4-D tensor of shape [batch, image_height, image_width, depth] . \n
*y:A 4-D tensor of shape [batch, image_height, image_width, depth]. The format
must be NHWC. \n

*@attention Constraints:
*Input grads must be a 4-D tensor . \n
@@ -273,6 +296,7 @@ REG_OP(CropAndResizeGradImage)
*@par Inputs:
*Input x must be a 4-D tensor. Inputs include:
*@li x: A 4-D float tensor of shape [batch_size, height, width, channels].
The format must be NHWC.
*@li size: A 1-D tensor of 2 elements containing the size of the glimpses to
extract. The glimpse height must be specified first, following by the glimpse
width.
@@ -293,7 +317,7 @@ uniform_noise . \n

*@par Outputs:
*y:A tensor representing the glimpses [batch_size, glimpse_height,
glimpse_width, channels] . \n
glimpse_width, channels]. The format must be NHWC. \n

*@attention Constraints:
*Input x must be a 4-D tensor . \n
@@ -340,7 +364,8 @@ REG_OP(HSVToRGB)

*@par Inputs:
*Input images must be a 4-D tensor. Inputs include:
*@li images: 4-D with shape [batch, height, width, channels].
*@li images: 4-D with shape [batch, height, width, channels]. The format must
be NHWC.
*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width. The new
size for the images.
*@li min: A Tensor of type float.
@@ -354,6 +379,7 @@ the values at the corner pixels. Defaults to false.

*@par Outputs:
*@li resized_images: 4-D with shape [batch, new_height, new_width, channels].
The format must be NHWC.
*@li y_min: A Tensor of type float.
*@li y_max: A Tensor of type float . \n

@@ -381,7 +407,8 @@ REG_OP(QuantizedResizeBilinear)

*@par Inputs:
*Input images must be a 4-D tensor. Inputs include:
*@li images: 4-D with shape [batch, height, width, channels].
*@li images: 4-D with shape [batch, height, width, channels]. The format must
be NHWC.
*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width.
The new size for the images . \n

@@ -391,7 +418,8 @@ output tensors are aligned, preserving the values at the corner pixels.
Defaults to false . \n

*@par Outputs:
*y: 4-D with shape [batch, new_height, new_width, channels] . \n
*y: 4-D with shape [batch, new_height, new_width, channels]. The format must
be NHWC. \n

*@attention Constraints:
*Input images can be of different types but output images are always float . \n
@@ -414,10 +442,10 @@ REG_OP(ResizeArea)
*@par Inputs:
*Input grads must be a 4-D tensor. Inputs include:
*@li grads: A Tensor of type float. 4-D with shape [batch, height, width,
channels].
channels]. The format must be NHWC.
*@li original_image: A Tensor. Must be one of the following types: float,
double. 4-D with shape [batch, orig_height, orig_width, channels], The image
tensor that was resized . \n
tensor that was resized. The format must be NHWC. \n

*@par Attributes:
*@li align_corners: An optional bool. Defaults to False. If true, the centers
@@ -426,10 +454,10 @@ false.
*@li half_pixel_centers: An optional bool. Defaults to False . \n

*@par Outputs:
*y: A Tensor. Has the same type as original_image . \n
*y: A Tensor. Has the same type as original_image. The format must be NHWC. \n

*@attention Constraints:
*Input images can be of different types but output images are always float . \n
*Input images can be of different types but output images are always float .

*@par Third-party framework compatibility
*Compatible with tensorflow ResizeBicubicGrad operator.
@@ -448,7 +476,8 @@ REG_OP(ResizeBicubicGrad)

*@par Inputs:
*Input images must be a 4-D tensor. Inputs include:
*@li images: 4-D with shape [batch, height, width, channels].
*@li images: 4-D with shape [batch, height, width, channels]. The format
must be NHWC.
*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width. The new
size for the images . \n

@@ -459,10 +488,11 @@ Defaults to false.
*@li half_pixel_centers: An optional bool. Defaults to False . \n

*@par Outputs:
*y: 4-D with shape [batch, new_height, new_width, channels] . \n
*y: 4-D with shape [batch, new_height, new_width, channels]. The format
must be NHWC. \n

*@attention Constraints:
*Input images can be of different types but output images are always float . \n
*Input images can be of different types but output images are always float .

*@par Third-party framework compatibility
*Compatible with tensorflow ResizeBicubic operator.
@@ -483,7 +513,7 @@ REG_OP(ResizeBicubic)
*@par Inputs:
*Input grads must be a 4-D tensor. Inputs include:
*@li grads: A Tensor. Must be one of the following types: uint8, int8, int32,
float16, float, double. 4-D with shape [batch, height, width, channels].
float16, float, double. Must set the format; the supported format list is ["NCHW", "NHWC"].
*@li size: A 1-D int32 Tensor of 2 elements: orig_height, orig_width.
The original input size . \n

@@ -550,9 +580,8 @@ REG_OP(ResizeNearestNeighborV2GradD)

*@par Inputs:
*Input grads must be a 4-D tensor. Inputs include:
*@li grads: A Tensor of type float32. 4-D with shape [batch, height, width,
channels].
*@li original_image: A Tensor. 4-D with shape [batch, orig_height, orig_width,
*@li grads: A Tensor of type float32. Must set the format; the supported format list is ["NCHW", "NHWC"].
*@li original_image: A 4-D Tensor, the image tensor that was resized. Must set the format; the supported format list is ["NCHW", "NHWC"]. \n

*@par Attributes:
@@ -583,7 +612,7 @@ REG_OP(ResizeBilinearV2Grad)

*@par Inputs:
*Input images must be a 4-D tensor. Inputs include:
*@li x: 4-D with shape [batch, height, width, channels].
*@li x: 4-D tensor. Must set the format; the supported format list is ["NCHW", "NHWC"].
*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width. The new
size for the images . \n

@@ -639,6 +668,62 @@ REG_OP(RGBToHSV)
/**
*@brief Generate a single randomly distorted bounding box for an image . \n

*@par Inputs:
*Input images must be a 4-D tensor. Inputs include:
*@li image_size: 1-D, containing [height, width, channels].
*@li bounding_boxes: 3-D with shape [batch, N, 4] describing the N bounding
boxes associated with the image. \n

*@par Attributes:
*@li seed: If either seed or seed2 are set to non-zero, the random number
generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
*@li seed2: A second seed to avoid seed collision.
*@li min_object_covered: The cropped area of the image must contain at least
this fraction of any bounding box supplied. The value of this parameter should
be non-negative. In the case of 0, the cropped area does not need to overlap
any of the bounding boxes supplied .
*@li aspect_ratio_range: The cropped area of the image must have an aspect
ratio = width / height within this range.
*@li max_attempts: Number of attempts at generating a cropped region of the
image of the specified constraints. After max_attempts failures, return the
entire image.
*@li use_image_if_no_bounding_boxes: Controls behavior if no bounding boxes
supplied. If true, assume an implicit bounding box covering the whole input.
If false, raise an error . \n

*@par Outputs:
*@li begin: 1-D, containing [offset_height, offset_width, 0].
*@li size: 1-D, containing [target_height, target_width, -1].
*@li bboxes: 3-D with shape [1, 1, 4] containing the distorted bounding box . \n

*@attention Constraints:
*Input images can be of different types but output images are always float . \n

*@par Third-party framework compatibility
*Compatible with tensorflow SampleDistortedBoundingBox operator.
*/

REG_OP(SampleDistortedBoundingBox)
.INPUT(image_size, TensorType({ DT_UINT8, DT_INT8, DT_INT16, \
DT_INT32, DT_INT64 }))
.INPUT(bounding_boxes, TensorType({ DT_FLOAT }))
.OUTPUT(begin, TensorType({ DT_UINT8, DT_INT8, DT_INT16, \
DT_INT32, DT_INT64 }))
.OUTPUT(size, TensorType({ DT_UINT8, DT_INT8, DT_INT16, \
DT_INT32, DT_INT64 }))
.OUTPUT(bboxes, TensorType({ DT_FLOAT }))
.ATTR(seed, Int, 0)
.ATTR(seed2, Int, 0)
.ATTR(min_object_covered, Float, 0.1f)
.ATTR(aspect_ratio_range, ListFloat, { 0.75f, 1.33f })
.ATTR(area_range, ListFloat, { 0.05f, 1.0f })
.ATTR(max_attempts, Int, 100)
.ATTR(use_image_if_no_bounding_boxes, Bool, false)
.OP_END_FACTORY_REG(SampleDistortedBoundingBox)
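
The sampling attributes above interact as a rejection loop: propose a crop, accept it once it covers at least min_object_covered of some box, and fall back to the whole image after max_attempts. A self-contained sketch with a deliberately simplified proposal distribution (all names are illustrative):

#include <algorithm>
#include <random>
#include <vector>

// Normalized [y1, x1, y2, x2] box, matching the bboxes layout above.
struct Box { float y1, x1, y2, x2; };

// Fraction of box b that lies inside crop c (illustrative helper).
float Coverage(const Box &c, const Box &b) {
  float iy = std::max(0.0f, std::min(c.y2, b.y2) - std::max(c.y1, b.y1));
  float ix = std::max(0.0f, std::min(c.x2, b.x2) - std::max(c.x1, b.x1));
  float area = (b.y2 - b.y1) * (b.x2 - b.x1);
  return area > 0.0f ? (iy * ix) / area : 0.0f;
}

// Schematic rejection-sampling loop for min_object_covered / max_attempts;
// the fixed-size crop proposal here is purely for illustration.
Box SampleCrop(const std::vector<Box> &boxes, float min_object_covered,
               int max_attempts, std::mt19937 &rng) {
  std::uniform_real_distribution<float> u(0.0f, 0.5f);
  for (int i = 0; i < max_attempts; ++i) {
    float y1 = u(rng), x1 = u(rng);
    Box crop{y1, x1, y1 + 0.5f, x1 + 0.5f};
    bool ok = boxes.empty();  // use_image_if_no_bounding_boxes == true case
    for (const Box &b : boxes) ok = ok || Coverage(crop, b) >= min_object_covered;
    if (ok) return crop;
  }
  return Box{0.0f, 0.0f, 1.0f, 1.0f};  // after max_attempts: whole image
}
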

/**
*@brief Generate a single randomly distorted bounding box for an image . \n

*@par Inputs:
*Input images must be a 4-D tensor. Inputs include:
*@li image_size: 1-D, containing [height, width, channels].
@@ -697,7 +782,7 @@ REG_OP(SampleDistortedBoundingBoxExt2)

*@par Inputs:
*Input x must be a 4-D tensor. Inputs include:
*@li x: 4-D with shape [batch, height, width, channels].
*@li x: 4-D tensor. Must set the format; the supported format list is ["NCHW", "NHWC"].
*@li size: A 1-D int32 Tensor of 2 elements: new_height, new_width.
The new size for the images . \n

@@ -729,12 +814,12 @@ REG_OP(ResizeNearestNeighborV2)
*@par Inputs:
*Input images must be a 4-D tensor. Inputs include:
*@li images: A Tensor. Must be one of the following types: float. 4-D with
shape [batch, height, width, depth]. A batch of images.
shape [batch, height, width, depth]. A batch of images. The format must be NHWC.
*@li boxes: A Tensor of type float32. 3-D with shape [batch,
num_bounding_boxes, 4] containing bounding boxes . \n

*@par Outputs:
*A Tensor. Has the same type as images . \n
*A Tensor. Has the same type as images. The format must be NHWC. \n

*@attention Constraints:
*Input images must be a 4-D tensor . \n
@@ -1002,6 +1087,88 @@ REG_OP(EncodePng)
.ATTR(compression, Int, -1)
.OP_END_FACTORY_REG(EncodePng)


/**
*@brief PNG-decode an image.
*@par Inputs:
*contents: A Tensor of type string. 0-D. The PNG-encoded image .

*@par Attributes:
*channels: Number of color channels for the decoded image. \n
*dtype: The data type of the output image.

*@par Outputs:
*image: A 3-D uint8 or uint16 Tensor of shape [height, width, channels]
where channels is: 1: for grayscale; 2: for grayscale + alpha; 3: for RGB;
4: for RGBA . \n

*@par Third-party framework compatibility
*Compatible with tensorflow DecodePng operator.
*/
REG_OP(DecodePng)
.INPUT(contents, TensorType({DT_STRING}))
.OUTPUT(image, TensorType({DT_UINT8, DT_UINT16}))
.ATTR(dtype, Type, DT_UINT8)
.ATTR(channels, Int, 0)
.OP_END_FACTORY_REG(DecodePng)

/**
*@brief Bmp-decode an image. \n

*@par Inputs:
*@li contents: A Tensor of type string. 0-D. The BMP-encoded image. \n

*@par Attributes:
*@li channels: Decode the desired number of color channels of the image. \n

*@par Outputs:
*image: A Tensor of type uint8.

* @par Third-party framework compatibility
* Compatible with tensorflow DecodeBmp operator.
*/

REG_OP(DecodeBmp)
.INPUT(contents, TensorType({DT_STRING}))
.OUTPUT(image, TensorType({DT_UINT8}))
.ATTR(channels, Int, 0)
.OP_END_FACTORY_REG(DecodeBmp)

/**
*@brief Function parse image from string to int. \n

*@par Inputs:
*@li contents: A Tensor of type string. 0-D. The JPEG-encoded image. \n
*@li crop_window: 1-D. The crop window: [crop_y, crop_x, crop_height, crop_width]. \n

*@par Attributes:
*@li channels: An optional int. Defaults to 0. Number of color channels for the
*decoded image.
*@li ratio: An optional int. Defaults to 1. Downscaling ratio.
*@li fancy_upscaling: An optional bool. Defaults to True. If true use a slower
*but nicer upscaling of the chroma planes
*@li try_recover_truncated: An optional bool. Defaults to False. If true try to
*recover an image from truncated input.
*@li acceptable_fraction: An optional float. Defaults to 1. The minimum required
fraction of lines before a truncated input is accepted.
*@li dct_method: An optional string. Defaults to "". string specifying a hint
*about the algorithm used for decompression. \n

*@par Outputs:
*image: A Tensor of type uint8.
*/
REG_OP(DecodeAndCropJpeg)
.INPUT(contents, TensorType({DT_STRING}))
.INPUT(crop_window, TensorType({DT_INT32}))
.OUTPUT(image, TensorType({DT_UINT8}))
.ATTR(channels, Int, 0)
.ATTR(ratio, Int, 1)
.ATTR(fancy_upscaling, Bool, true)
.ATTR(try_recover_truncated, Bool, false)
.ATTR(acceptable_fraction, Float, 1.0)
.ATTR(dct_method, String, "")
.OP_END_FACTORY_REG(DecodeAndCropJpeg)

/**
*@brief Resizes "images" to "size" using bilinear interpolation . \n

@@ -1316,6 +1483,55 @@ REG_OP(CombinedNonMaxSuppression)
.ATTR(clip_boxes, Bool, true)
.OP_END_FACTORY_REG(CombinedNonMaxSuppression)

/**
*@brief Resizes "images" with "offset" using bilinear interpolation. \n

*@par Inputs:
*@li img: input image, A 4-D tensor of shape `[n, h, w, c]`.
*@li warp_offset: the resize offset, a 4-D float tensor of shape `[n, h, w, 2]`, where 2 means the (x, y) offset per point.

*@par Outputs:
*warp_img: A Tensor after resize. \n
*/
REG_OP(IMGWarp)
.INPUT(img, TensorType({DT_UINT8, DT_FLOAT16, DT_FLOAT32}))
.INPUT(warp_offset, TensorType({DT_FLOAT32}))
.OUTPUT(warp_img, TensorType({DT_UINT8, DT_FLOAT16, DT_FLOAT32}))
.OP_END_FACTORY_REG(IMGWarp)

/**
*@brief Resizes "images" with "offset" using bilinear interpolation. \n

*@par Inputs:
*@li img: input image, A 4-D tensor of shape `[n, h, w, c]`.
*@li map_offset: the resize offset, a 4-D float tensor of shape `[n, h, w, 2]`, where 2 means the (x, y) resize point.

*@par Outputs:
*map_img: A Tensor after resize. \n
*/
REG_OP(Remap)
.INPUT(img, TensorType({DT_UINT8, DT_FLOAT16, DT_FLOAT32}))
.INPUT(map_offset, TensorType({DT_FLOAT32}))
.OUTPUT(map_img, TensorType({DT_UINT8, DT_FLOAT16, DT_FLOAT32}))
.OP_END_FACTORY_REG(Remap)

/**
*@brief Resizes "images" with "offset" using bilinear interpolation. \n

*@par Inputs:
*@li img: input image, a 5-D tensor of shape `[n, 4, c, h, w]`,
where 4 means input[(h_top, w_left), (h_top, w_right), (h_bottom, w_left), (h_bottom, w_right)].
*@li warp_index: the resize offset, a 4-D float tensor of shape `[n, 2, h, w]`, where 2 means the (x, y) resize point.

*@par Outputs:
*remap_img: A Tensor after ResizeBilinear, A 4-D tensor of shape `[n, c, h, w]`. \n
*/
REG_OP(IMGWarpResize)
.INPUT(img, TensorType({DT_FLOAT32}))
.INPUT(warp_index, TensorType({DT_FLOAT32}))
.OUTPUT(warp_img, TensorType({DT_FLOAT32}))
.OP_END_FACTORY_REG(IMGWarpResize)

/**
*@brief Function spatial transformer . \n

@@ -1342,6 +1558,383 @@ REG_OP(SpatialTransformerD)
.ATTR(use_default_theta, ListBool, {})
.OP_END_FACTORY_REG(SpatialTransformerD)

} // namespace ge
/**
* @brief Resize the input tensor. \n
Currently, only resizing image tensors using nearest neighbor and linear interpolation is supported.

* @par Inputs:
* Input x must be a 4-D tensor. Inputs include: \n
* @li x: A Tensor. Must be one of the following types: uint8, int8, int16, \n
int32, int64, float16, float, double. 4-D with shape [batch, height, width, channels] \n
or shape [batch, channels, height, width].
* @li roi: A 1-D float Tensor. only takes effect when attr coordinate_transformation_mode \n
is "tf_crop_and_resize"
* @li scales: A 1-D float Tensor, the scale array along each dimension, Only one of \n
'scales' and 'sizes' can be specified.
* @li sizes: A 1-D int64 Tensor, the size of the output tensor. Only one of \n
'scales' and 'sizes' can be specified. If 'sizes' is specified, then set scales \n
to empty data (zero shape) in this operator's input list.

* @par Attributes:
* @li coordinate_transformation_mode: String. Defaults to half_pixel. How to transform \n
the coordinate in the resized tensor to the coordinate in the original tensor. \n
Other options: pytorch_half_pixel, align_corners, asymmetric, tf_half_pixel_for_nn, \n
tf_crop_and_resize.
* @li cubic_coeff_a: Float. Defaults to -0.75; only used in cubic interpolation. \n
Another option: -0.5.
* @li exclude_outside: Int. Defaults to 0, If set to 1, the weight of sampling \n
locations outside the tensor will be set to 0 and the weight will be renormalized \n
so that their sum is 1.0.
* @li extrapolation_value: Float. Defaults to 0.0f. When coordinate_transformation_mode \n
is "tf_crop_and_resize" and x_original is outside the range [0, length_original - 1], \n
this value is used as the corresponding output value.
* @li mode: String. Defaults to nearest. Three interpolation modes: nearest (default), \n
linear and cubic.
* @li nearest_mode: String. Defaults to round_prefer_floor. Four modes: round_prefer_floor, \n
round_prefer_ceil, floor, ceil. Only used by nearest interpolation.

* @par Outputs:
* y: A Tensor. Has the same type as x.

* @attention Constraints: \n
* Input x must be a 4-D tensor.

* @par Third-party framework compatibility
* Compatible with tensorflow ResizeNearestNeighborV2 operator.
*/

REG_OP(Resize)
.INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32,
DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(roi, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(scales, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(sizes, TensorType({DT_INT64}))
.OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32,
DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.ATTR(coordinate_transformation_mode, String, "half_pixel")
.ATTR(cubic_coeff_a, Float, -0.75)
.ATTR(exclude_outside, Int, 0)
.ATTR(extrapolation_value, Float, 0)
.ATTR(mode, String, "nearest")
.ATTR(nearest_mode, String, "round_prefer_floor")
.OP_END_FACTORY_REG(Resize)
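
The attributes mirror the ONNX Resize conventions; assuming those definitions, the two most common coordinate mappings and the default nearest rounding rule look like this:

#include <cmath>

// Maps an output coordinate back to the input grid, per the ONNX-style
// coordinate_transformation_mode values documented above.
float SourceCoord(float x_out, float scale, int len_in, int len_out,
                  bool align_corners) {
  if (align_corners) {
    return len_out > 1
               ? x_out * static_cast<float>(len_in - 1) / (len_out - 1)
               : 0.0f;
  }
  return (x_out + 0.5f) / scale - 0.5f;  // "half_pixel" (the default)
}

// nearest_mode = "round_prefer_floor": halfway cases round down.
int RoundPreferFloor(float x) { return static_cast<int>(std::ceil(x - 0.5f)); }
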

/**
*@brief Function parse image from string to int. \n

*@par Inputs:
*@li contents: A Tensor of type string. 0-D. The JPEG-encoded image. \n

*@par Attributes:
*@li channels: An optional int. Defaults to 0. Number of color channels for the decoded image.
*@li ratio: An optional int. Defaults to 1. Downscaling ratio.
*@li fancy_upscaling: An optional bool. Defaults to True. If true use a slower but nicer upscaling of the chroma planes
*@li try_recover_truncated: An optional bool. Defaults to False. If true try to recover an image from truncated input.
*@li acceptable_fraction: An optional float. Defaults to 1. The minimum required fraction of lines before a truncated input is accepted.
*@li dct_method: An optional string. Defaults to "". string specifying a hint about the algorithm used for decompression. \n

*@par Outputs:
*image: A Tensor of type uint8.
*/
REG_OP(DecodeJpeg)
.INPUT(contents, TensorType({DT_STRING}))
.OUTPUT(image, TensorType({DT_UINT8}))
.ATTR(channels, Int, 0)
.ATTR(ratio, Int, 1)
.ATTR(fancy_upscaling, Bool, true)
.ATTR(try_recover_truncated, Bool, false)
.ATTR(acceptable_fraction, Float, 1.0)
.ATTR(dct_method, String, "")
.OP_END_FACTORY_REG(DecodeJpeg)

/**
*@brief Image warping using per-pixel flow vectors. \n

*@par Inputs:
*@li image: 4-D Tensor with shape `[batch, height, width, channels]`.
*@li flow: 4-D Tensor with shape `[batch, height, width, 2]`. \n

*@par Outputs:
*y: Returns 4-D with the same shape and dtype as `image`. \n
*/
REG_OP(DenseImageWarp)
.INPUT(image, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(flow, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(DenseImageWarp)
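
A sketch of the warp for one channel, assuming the TensorFlow-Addons convention y(h, w) = image(h - flow_y, w - flow_x) with bilinear sampling and zero padding outside the image (helper names are illustrative):

#include <cmath>
#include <vector>

// Bilinear sample of a single-channel H x W image at fractional (y, x);
// the caller passes y = h - flow_y and x = w - flow_x per output pixel.
float SampleBilinear(const std::vector<float> &img, int H, int W,
                     float y, float x) {
  int y0 = static_cast<int>(std::floor(y));
  int x0 = static_cast<int>(std::floor(x));
  float wy = y - y0, wx = x - x0;
  auto at = [&](int yy, int xx) {
    return (yy < 0 || yy >= H || xx < 0 || xx >= W) ? 0.0f : img[yy * W + xx];
  };
  return (1 - wy) * ((1 - wx) * at(y0, x0) + wx * at(y0, x0 + 1)) +
         wy * ((1 - wx) * at(y0 + 1, x0) + wx * at(y0 + 1, x0 + 1));
}
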

/**
*@brief Calculate the resize_d function. \n

*@par Inputs:
*One inputs, including:
* @li x: A tensor. Must be one of the following types:
* float16, float32. \n

*@par Attributes:
*@li sizes: A required listInt. \n
*@li scales: An optional listFloat.
Defaults to none. \n
*@li roi: An optional listInt.
Defaults to none. \n
*@li coordinate_transformation_mode: An optional String.
Defaults to "half_pixel". \n
*@li cubic_coeff_a: An optional float.
Defaults to -0.75. \n
*@li exclude_outside: An optional int.
Defaults to 0. \n
*@li extrapolation_value: An optional float.
Defaults to 0.0. \n
*@li mode: An optional String.
Defaults to "nearest". \n
*@li nearest_mode: An optional String.
Defaults to "round_prefer_floor". \n

*@par Outputs:
*y: A Tensor with the same type as x;
its shape depends on x and sizes. \n
*/
REG_OP(ResizeD)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(sizes, ListInt)
.ATTR(scales, ListFloat, {})
.ATTR(roi, ListInt, {})
.ATTR(coordinate_transformation_mode, String, "half_pixel")
.ATTR(cubic_coeff_a, Float, -0.75)
.ATTR(exclude_outside, Int, 0)
.ATTR(extrapolation_value, Float, 0.0)
.ATTR(mode, String, "nearest")
.ATTR(nearest_mode, String, "round_prefer_floor")
.OP_END_FACTORY_REG(ResizeD)

/**
*@brief Calculate the resize_grad_d function. \n

*@par Inputs:
*One inputs, including:
* @li grads: A tensor. Must be one of the following types:
* float16, float32. \n

*@par Attributes:
*@li original_size: A required listInt. \n
*@li roi: An optional listInt.
Defaults to none. \n
*@li scales: An optional listFloat.
Defaults to none. \n
*@li coordinate_transformation_mode: An optional String.
Defaults to "half_pixel". \n
*@li cubic_coeff_a: An optional float.
Defaults to -0.75. \n
*@li exclude_outside: An optional int.
Defaults to 0. \n
*@li extrapolation_value: An optional float.
Defaults to 0.0. \n
*@li mode: An optional String.
Defaults to "nearest". \n
*@li nearest_mode: An optional String.
Defaults to "round_prefer_floor". \n

*@par Outputs:
*y: A Tensor with the same type as grads;
its shape depends on grads and original_size. \n
*/
REG_OP(ResizeGradD)
.INPUT(grads, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(original_size, ListInt)
.ATTR(roi, ListInt, {})
.ATTR(scales, ListFloat, {})
.ATTR(coordinate_transformation_mode, String, "half_pixel")
.ATTR(cubic_coeff_a, Float, -0.75)
.ATTR(exclude_outside, Int, 0)
.ATTR(extrapolation_value, Float, 0.0)
.ATTR(mode, String, "nearest")
.ATTR(nearest_mode, String, "round_prefer_floor")
.OP_END_FACTORY_REG(ResizeGradD)

/**
*@brief Computes the gradients of DenseImageWarp with respect to image and flow. \n

*@par Inputs:
*@li grad: gradients with respect to DenseImageWarp output.
*@li image: 4-D Tensor with shape `[batch, height, width, channels]`.
*@li flow: 4-D Tensor with shape `[batch, height, width, 2]`. \n

*@par Outputs:
*grad_image: Returns 4-D with the same shape and dtype as `image`.
*grad_flow: Returns 4-D with the same shape and dtype as `flow`. \n
*/
REG_OP(DenseImageWarpGrad)
.INPUT(grad, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(image, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(flow, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(grad_image, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(grad_flow, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(DenseImageWarpGrad)

/**
*@brief This operation samples input x by using interpolation based on a flow field grid,
which is usually generated by affine_grid. The grid of shape [N, H, W, 2] is the concatenation of
(x, y) coordinates with shape [N, H, W] each, where x indexes the 4th dimension (the width dimension) of
input data x and y indexes the 3rd dimension (the height dimension); the result is
the interpolated value of the 4 nearest corner points. The output tensor shape will be [N, C, H, W].

*@par Inputs:
*@li x: 4-D Tensor with shape `[batch, channels, height, width]`.
*@li grid: flow field grid, 4-D Tensor with shape `[batch, height, width, 2]`.

*@par Attributes:
*@li interpolation_mode: An optional string specifying the interpolation method. Only 'bilinear' is
supported for now .
*@li padding_mode: An optional string specifying the pad method. Only 'zeros' is supported for now .
*@li align_corners: An optional bool. If "true", the centers of the corner
pixels of the input and output tensors are aligned. Defaults to "false" .

*@par Outputs:
*y: Returns 4-D Tensor with the same dtype as `X`.

*@par Third-party framework compatibility
*Compatible with pytorch GridSampler2D operator.

*@par Restrictions:
*Warning:THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(GridSampler2D)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(grid, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(interpolation_mode, String, "bilinear")
.ATTR(padding_mode, String, "zeros")
.ATTR(align_corners, Bool, false)
.OP_END_FACTORY_REG(GridSampler2D)
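
The grid holds normalized coordinates in [-1, 1]; assuming the usual PyTorch grid_sample convention, they map to pixel indices as follows:

// Unnormalizes one grid coordinate from [-1, 1] to pixel space;
// "size" is the input height or width along that axis.
float Unnormalize(float coord, int size, bool align_corners) {
  if (align_corners) {
    return (coord + 1.0f) * 0.5f * (size - 1);   // -1 -> 0, 1 -> size-1
  }
  return ((coord + 1.0f) * size - 1.0f) * 0.5f;  // pixel-center convention
}
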

/**
*@brief This operation unnormalizes the input grid, which is usually generated by affine_grid.

*@par Inputs:
*@li grid: flow field grid, 4-D Tensor with shape `[batch, height, width, 2]`.
*@li assist: Assist matrix, a 4-D tensor of type float16.

*@par Attributes:
*@li align_corners: An optional bool. If "true", the centers of the corner
pixels of the input and output tensors are aligned. Defaults to "false" .

*@par Outputs:
*diff: Returns 4-D Tensor with the same shape and dtype as `grid`.
*position: Returns 4-D Tensor with the same shape as `grid`.
*/
REG_OP(GridUnnormal)
.INPUT(grid, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(assist, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(diff, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(position, TensorType({DT_INT32}))
.ATTR(align_corners, Bool, false)
.OP_END_FACTORY_REG(GridUnnormal)

/**
*@brief This operation unfolds input x based on an unnormalized grid, which is generated by GridUnnormal.

*@par Inputs:
*@li x: 4-D Tensor with shape `[batch, channels, height, width]`.
*@li position: 4-D Tensor with shape `[batch, output_height, output_width, 2]`.

*@par Attributes:
*@li padding_mode: An optional string specifying the pad method. Only 'zeros' is supported for now .

*@par Outputs:
*y: Returns 4-D Tensor with the same dtype as `x`.
*/
REG_OP(ImageUnfold)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(position, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(padding_mode, String, "zeros")
.OP_END_FACTORY_REG(ImageUnfold)

/**
*@brief This operation selects images into warp_images according to offsets.

*@par Inputs:
*@li images: 4-D Tensor with shape `[batch, height, width, 3]`.
*@li offsets: 4-D Tensor with shape `[batch, 4, new_height, new_width]`.

*@par Outputs:
*warp_images: Returns 5-D Tensor with shape
`[batch, 4, new_height, new_width, 3]` and the same dtype as `images`.
*/
REG_OP(IMGWarpOffsets)
.INPUT(images, TensorType({DT_UINT8, DT_FLOAT16, DT_FLOAT}))
.INPUT(offsets, TensorType({DT_FLOAT, DT_INT32}))
.OUTPUT(warp_images, TensorType({DT_UINT8, DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(IMGWarpOffsets)

/**
*@brief This operation samples 3-D input x by using interpolation based on a flow field grid,
which is usually generated by affine_grid.

*@par Inputs:
*@li x: 5-D Tensor with shape `[batch, channels, depth, height, width]`.
*@li grid: flow field grid, 5-D Tensor with shape `[batch, depth, height, width, 2]`.

*@par Attributes:
*@li interpolation_mode: An optional string specifying the interpolation method.
*@li padding_mode: An optional string specifying the pad method.
*@li align_corners: An optional bool. If "true", the centers of the corner
pixels of the input and output tensors are aligned. Defaults to "false" .

*@par Outputs:
*y: Returns 5-D Tensor with the same dtype as `x`.

*@par Third-party framework compatibility
*Compatible with pytorch GridSampler3D operator.

*@par Restrictions:
*Warning:THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(GridSampler3D)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(grid, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.ATTR(interpolation_mode, String, "bilinear")
.ATTR(padding_mode, String, "zeros")
.ATTR(align_corners, Bool, false)
.OP_END_FACTORY_REG(GridSampler3D)

/**
*@brief Computes the gradients of GridSampler3D.

*@par Inputs:
*@li grad: 5-D Tensor with shape `[batch, channels, depth, height, width]`.
*@li x: 5-D Tensor with shape `[batch, channels, depth, height, width]`.
*@li grid: flow field grid, 5-D Tensor with shape `[batch, depth, height, width, 2]`.

*@par Attributes:
*@li interpolation_mode: An optional string specifying the interpolation method.
*@li padding_mode: An optional string specifying the pad method.
*@li align_corners: An optional bool. If "true", the centers of the corner
pixels of the input and output tensors are aligned. Defaults to "false" .

*@par Outputs:
*dx: Returns 5-D Tensor with the same dtype and shape as `x`.
*dgrid: Returns 5-D Tensor with the same dtype and shape as `grid`.

*@par Third-party framework compatibility
*Compatible with pytorch GridSampler3DGrad operator.

*@par Restrictions:
*Warning:THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(GridSampler3DGrad)
.INPUT(grad, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.INPUT(grid, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(dx, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(dgrid, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.ATTR(interpolation_mode, String, "bilinear")
.ATTR(padding_mode, String, "zeros")
.ATTR(align_corners, Bool, false)
.OP_END_FACTORY_REG(GridSampler3DGrad)

} // namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_IMAGE_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/internal_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 93
- 45
third_party/fwkacllib/inc/ops/linalg_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -61,8 +61,8 @@ REG_OP(CholeskyGrad)

*@par Inputs:
*The input x has to be symmetric and positive definite. Inputs include:
*x:A Tensor. Must be one of the following types: double, float32. Shape
is [..., M, M] . \n
*x:A Tensor. Must be one of the following types: double, float32, float16,
complex64, complex128. Shape is [..., M, M] . \n

*@par Outputs:
*y:A Tensor. Has the same type as x . \n
@@ -76,10 +76,31 @@ form square matrices.
*/

REG_OP(Cholesky)
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE, \
DT_FLOAT16, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, \
DT_FLOAT16, DT_COMPLEX64, DT_COMPLEX128}))
.OP_END_FACTORY_REG(Cholesky)

/**
*@brief Computes the outer product of two 1D vectors . \n

*@par Inputs:
*The inputs x1 and x2 have to be 1-D vectors. Inputs include:
*@li x1:A Tensor. Must be one of the following types: float16, float32.
Shape is [N] . \n
*@li x2:A Tensor. Must have the same type as x1. Shape is [M] . \n

*@par Outputs:
*y:A Tensor. Has the same type as x1 . \n
*/

REG_OP(Ger)
.INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(Ger)
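
Ger is the classic outer product, y[i][j] = x1[i] * x2[j]; a minimal reference:

#include <vector>

// Reference outer product: y has shape [N, M] for x1 of shape [N]
// and x2 of shape [M], matching the Ger registration above.
std::vector<std::vector<float>> Outer(const std::vector<float> &x1,
                                      const std::vector<float> &x2) {
  std::vector<std::vector<float>> y(x1.size(), std::vector<float>(x2.size()));
  for (size_t i = 0; i < x1.size(); ++i)
    for (size_t j = 0; j < x2.size(); ++j)
      y[i][j] = x1[i] * x2[j];
  return y;
}
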

/**
*@brief Computes the sign and the log of the absolute value of the determinant
of one or more square matrices . \n
@@ -87,8 +108,8 @@ of one or more square matrices . \n
*@par Inputs:
*The input x is a tensor of shape [N, M, M] whose inner-most 2 dimensions
form square matrices. Inputs include:
*x:A Tensor. Must be one of the following types: double, float32. Shape is
[..., M, M] . \n
*x:A Tensor. Must be one of the following types: double, float32,
complex64, complex128. Shape is [..., M, M] . \n

*@par Outputs:
*@li y:A Tensor. Has the same type as x.
@@ -103,9 +124,9 @@ form square matrices. \n
*/

REG_OP(LogMatrixDeterminant)
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(sign, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(sign, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OP_END_FACTORY_REG(LogMatrixDeterminant)

/**
@@ -114,8 +135,8 @@ REG_OP(LogMatrixDeterminant)
*@par Inputs:
*The input x is a tensor of shape [N, M, M] whose inner-most 2 dimensions
form square matrices. Inputs include:
*x:A Tensor. Must be one of the following types: double, float32. Shape is
[..., M, M] . \n
*x:A Tensor. Must be one of the following types: double, float32, complex64,
complex128. Shape is [..., M, M] . \n

*@par Outputs:
*y:A Tensor. Has the same type as x . \n
@@ -129,8 +150,8 @@ form square matrices.
*/

REG_OP(MatrixDeterminant)
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OP_END_FACTORY_REG(MatrixDeterminant)

/**
@@ -140,8 +161,7 @@ their adjoints (conjugate transposes) . \n
*@par Inputs:
*The input x is a tensor of shape [..., M, M] whose inner-most 2 dimensions
form square matrices. Inputs include:
*x:A Tensor. Must be one of the following types: double, float. Shape is
[..., M, M] . \n
*x:A Tensor of input. Shape is [..., M, M] . \n

*@par Attributes:
*adjoint:An optional bool. Defaults to False.Boolean indicating whether to
@@ -159,8 +179,8 @@ form square matrices. \n
*/

REG_OP(MatrixInverse)
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.ATTR(adjoint, Bool, false)
.OP_END_FACTORY_REG(MatrixInverse)

@@ -169,8 +189,7 @@ REG_OP(MatrixInverse)

*@par Inputs:
*The input rhs must have the same type as matrix. Inputs include:
*@li matrix:A Tensor. Must be one of the following types: double, float.
Shape is [..., M, M].
*@li matrix:A Tensor of input. Shape is [..., M, M].
*@li rhs:A Tensor. Must have the same type as matrix. Shape is [..., M, K] . \n

*@par Attributes:
@@ -189,9 +208,9 @@ dimensions form square matrices. \n
*/

REG_OP(MatrixSolve)
.INPUT(matrix, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(matrix, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.ATTR(adjoint, Bool, false)
.OP_END_FACTORY_REG(MatrixSolve)

@@ -221,8 +240,8 @@ dimensions form square matrices. \n
*/

REG_OP(MatrixSolveLs)
.INPUT(matrix, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(matrix, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.INPUT(l2, TensorType({DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.ATTR(fast, Bool, true)
@@ -234,8 +253,7 @@ matrices by backsubstitution . \n

*@par Inputs:
*The input rhs must have the same type as matrix. Inputs include:
*@li matrix: A Tensor. Must be one of the following types: double, float.
Shape is [..., M, M].
*@li matrix: A Tensor. Shape is [..., M, M].
*@li rhs:A Tensor. Must have the same type as matrix. Shape is [..., M, K] . \n

*@par Attributes:
@@ -256,9 +274,9 @@ dimensions form square matrices. \n
*/

REG_OP(MatrixTriangularSolve)
.INPUT(matrix, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(matrix, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.ATTR(lower, Bool, true)
.ATTR(adjoint, Bool, false)
.OP_END_FACTORY_REG(MatrixTriangularSolve)
@@ -268,8 +286,7 @@ REG_OP(MatrixTriangularSolve)

*@par Inputs:
*The input shape of x must be [..., M, N]. Inputs include:
*x:A Tensor whose shape is [..., M, N]. Must be one of the following types:
double, float . \n
*x:A Tensor whose shape is [..., M, N]. \n

*@par Attributes:
*full_matrices: An optional bool. Defaults to False. If true, compute
@@ -289,9 +306,12 @@ dimensions form matrices of size [M, N]. \n
*/

REG_OP(Qr)
.INPUT(x, TensorType({ DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
.OUTPUT(q, TensorType({ DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
.OUTPUT(r, TensorType({ DT_FLOAT16, DT_FLOAT, DT_DOUBLE }))
.INPUT(x, TensorType({ DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \
DT_COMPLEX64, DT_COMPLEX128 }))
.OUTPUT(q, TensorType({ DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \
DT_COMPLEX64, DT_COMPLEX128 }))
.OUTPUT(r, TensorType({ DT_FLOAT16, DT_FLOAT, DT_DOUBLE, \
DT_COMPLEX64, DT_COMPLEX128 }))
.ATTR(full_matrices, Bool, false)
.OP_END_FACTORY_REG(Qr)

@@ -320,12 +340,40 @@ form square matrices. \n
*/

REG_OP(SelfAdjointEig)
.INPUT(x, TensorType({ DT_DOUBLE, DT_FLOAT }))
.OUTPUT(eigen_value, TensorType({ DT_DOUBLE, DT_FLOAT }))
.OUTPUT(eigen_vector, TensorType({ DT_DOUBLE, DT_FLOAT }))
.INPUT(x, TensorType({ DT_DOUBLE, DT_FLOAT, DT_COMPLEX64, DT_COMPLEX128 }))
.OUTPUT(eigen_value, TensorType({ DT_DOUBLE, DT_FLOAT, DT_COMPLEX64, DT_COMPLEX128 }))
.OUTPUT(eigen_vector, TensorType({ DT_DOUBLE, DT_FLOAT, DT_COMPLEX64, DT_COMPLEX128 }))
.ATTR(compute_v, Bool, true)
.OP_END_FACTORY_REG(SelfAdjointEig)

/**
*@brief Computes the sign and the log of the absolute value of the determinant
of one or more square matrices . \n

*@par Inputs:
*The input x is a tensor of shape [N, M, M] whose inner-most 2 dimensions
form square matrices. Inputs include:
*x:A Tensor. Must be one of the following types: double, float32, float16.
Shape is [..., M, M] . \n

*@par Outputs:
*@li y:A Tensor. Has the same type as x.
*@li sign:A Tensor. Has the same type as x . \n

*@attention Constraints:
*The input x is a tensor of shape [N, M, M] whose inner-most 2 dimensions
form square matrices. \n

*@par Third-party framework compatibility
*Compatible with tensorflow LogMatrixDeterminant operator.
*/

REG_OP(Slogdet)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(sign, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(Slogdet)
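
For intuition, sign = det / |det| and y = log(|det|); a closed-form 2x2 reference (illustrative only, real implementations factorize the matrix):

#include <cmath>
#include <utility>

// Reference slogdet for a 2x2 matrix [[a, b], [c, d]]:
// returns {sign(det), log(|det|)}; sign is 0 when det == 0.
std::pair<float, float> Slogdet2x2(float a, float b, float c, float d) {
  float det = a * d - b * c;
  float sign = (det > 0) - (det < 0);
  return {sign, det == 0.0f ? -INFINITY : std::log(std::fabs(det))};
}
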

/**
*@brief Computes the singular value decompositions of one or more matrices . \n

@@ -384,8 +432,8 @@ of the rows encoded as a list of indices in `0..M-1`. Shape is `[..., M]` . \n
*/

REG_OP(Lu)
.INPUT(input, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(lu, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(input, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(lu, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(p, TensorType({DT_INT32, DT_INT64}))
.REQUIRED_ATTR(output_idx_type, Type)
.OP_END_FACTORY_REG(Lu)
@@ -404,8 +452,8 @@ y: Shape is `[..., M, M]` . \n
*/

REG_OP(MatrixSquareRoot)
.INPUT(input, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(input, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OP_END_FACTORY_REG(MatrixSquareRoot)

/**
@@ -424,9 +472,9 @@ y: Tensor of shape `[..., M, K]` containing the solutions \n
*/

REG_OP(TridiagonalSolve)
.INPUT(diagonals, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(diagonals, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.INPUT(rhs, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}))
.ATTR(partial_pivoting, Bool, true)
.OP_END_FACTORY_REG(TridiagonalSolve)



+ 504
- 0
third_party/fwkacllib/inc/ops/list_ops.h View File

@@ -0,0 +1,504 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*!
* \file list_ops.h
* \brief
*/
#ifndef OPS_BUILT_IN_OP_PROTO_INC_LIST_OPS_H_
#define OPS_BUILT_IN_OP_PROTO_INC_LIST_OPS_H_

#include <algorithm>
#include "graph/operator_reg.h"
#include "graph/operator.h"

namespace ge {

/**
*@brief Creates and returns an empty tensor list. \n

*@par Inputs:
*@li element_shape: A shape compatible with that of elements in the list.
*@li max_num_elements: The maximum number of elements. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li handle: An empty tensor list . \n

*@par Third-party framework compatibility.
*Compatible with tensorflow EmptyTensorList operator.
*/
REG_OP(EmptyTensorList)
.INPUT(element_shape, TensorType({DT_INT32,DT_INT64}))
.INPUT(max_num_elements, TensorType({DT_INT32}))
.OUTPUT(handle, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(EmptyTensorList)

/**
*@brief Returns a list which has the passed-in `Tensor` as last element
and the other elements of the given list in `input_handle`. \n

*@par Inputs:
*@li input_handle: The old list.
*@li tensor: The tensor to put on the list. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li output_handle:A list with the elements of old list followed by tensor. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListPushBack operator.
*/
REG_OP(TensorListPushBack)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.INPUT(tensor, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,DT_RESOURCE,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.OUTPUT(output_handle, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListPushBack)

/**
*@brief Returns the last element of the input list, as well as a
list with all but that element. \n

*@par Inputs:
*@li input_handle: The input list.
*@li element_shape: A shape compatible with that of elements in the list. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li output_handle:A list with all the elements of the old list except the last.
*@li tensor:The withdrawn last element of the list. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListPopBack operator.
*/
REG_OP(TensorListPopBack)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.INPUT(element_shape, TensorType({DT_INT32}))
.OUTPUT(output_handle, TensorType({DT_VARIANT}))
.OUTPUT(tensor, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,DT_RESOURCE,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListPopBack)
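
Semantically, these two ops behave like a functional stack of tensors: PushBack yields a new list with the element appended, PopBack yields the shortened list plus the removed element. A host-side analogy with std::vector (purely illustrative, not the GE API):

#include <utility>
#include <vector>

using Tensor = std::vector<float>;       // stand-in element type
using TensorList = std::vector<Tensor>;  // stand-in for the variant handle

// PushBack returns a new list with `t` appended; the old handle is
// conceptually unchanged (the ops are functional).
TensorList PushBack(TensorList list, Tensor t) {
  list.push_back(std::move(t));
  return list;
}

// PopBack returns the shortened list together with the removed element;
// assumes the list is non-empty.
std::pair<TensorList, Tensor> PopBack(TensorList list) {
  Tensor last = std::move(list.back());
  list.pop_back();
  return {std::move(list), std::move(last)};
}
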

/**
*@brief Returns the number of tensors in the input tensor list. \n

*@par Inputs:
*@li input_handle: The input list. \n

*@par Outputs:
*@li length:The number of tensors in the list. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListLength operator.
*/
REG_OP(TensorListLength)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.OUTPUT(length, TensorType({DT_INT32}))
.OP_END_FACTORY_REG(TensorListLength)

/**
*@brief Returns the shape of elements in the input tensor list. \n

*@par Inputs:
*@li input_handle: The input list. \n

*@par Attributes:
*@li shape_type: The type of shape in the list. \n

*@par Outputs:
*@li element_shape:A shape compatible with that of elements in the list. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListElementShape operator.
*/
REG_OP(TensorListElementShape)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.OUTPUT(element_shape, TensorType({DT_INT32,DT_INT64}))
.ATTR(shape_type, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListElementShape)

/**
*@brief Creates a list of the given size with empty elements. \n

*@par Inputs:
*@li element_shape: A shape compatible with that of elements in the list.
*@li num_elements: The number of elements to reserve. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list.
*@li shape_type: The type of shape in the list. \n

*@par Outputs:
*@li handle: An output tensor list . \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListReserve operator.
*/
REG_OP(TensorListReserve)
.INPUT(element_shape, TensorType({DT_INT32,DT_INT64}))
.INPUT(num_elements, TensorType({DT_INT32}))
.OUTPUT(handle, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.ATTR(shape_type, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListReserve)

/**
*@brief Gets the element of the input tensor list at the given index position. \n

*@par Inputs:
*@li input_handle: The input list.
*@li index: A tensor of position.
*@li element_shape: A shape compatible with that of elements in the list. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li item: An output tensor value of index position . \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListGetItem operator.
*/
REG_OP(TensorListGetItem)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.INPUT(index, TensorType({DT_INT32}))
.INPUT(element_shape, TensorType({DT_INT32}))
.OUTPUT(item, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListGetItem)

/**
*@brief Sets the index-th position of the list to contain the given tensor. \n

*@par Inputs:
*@li input_handle: The input list.
*@li index: The position in the list to which the tensor will be assigned.
*@li item: The element to be assigned to that position. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li output_handle: An output tensor list . \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListSetItem operator.
*/
REG_OP(TensorListSetItem)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.INPUT(index, TensorType({DT_INT32}))
.INPUT(item, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,DT_RESOURCE,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.OUTPUT(output_handle, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListSetItem)

/**
*@brief Pushes a tensor onto each tensor list in the batch. \n

*@par Inputs:
*@li input_handles: The input tensor lists.
*@li tensor: The tensor to push onto the tensor lists. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li output_handles: The output tensor lists. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListPushBackBatch operator.
*/
REG_OP(TensorListPushBackBatch)
.INPUT(input_handles, TensorType({DT_VARIANT}))
.INPUT(tensor, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.OUTPUT(output_handles, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListPushBackBatch)

/**
*@brief Stacks all tensors in the list. \n

*@par Inputs:
*@li input_handle: The input tensor list.
*@li element_shape: A shape compatible with that of elements in the tensor. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list.
*@li num_elements: The number of elements in the list. \n

*@par Outputs:
*@li tensor: The stacked tensor of the list. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListStack operator.
*/
REG_OP(TensorListStack)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.INPUT(element_shape, TensorType({DT_INT32}))
.OUTPUT(tensor, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.ATTR(element_dtype, Type, DT_INT32)
.ATTR(num_elements, Int, -1)
.OP_END_FACTORY_REG(TensorListStack)
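
The reserve/set/stack flow described by TensorListReserve, TensorListSetItem, and TensorListStack above can be modeled host-side with a vector of vectors. The following is a minimal C++ sketch of the semantics only, not the device implementation; all names are illustrative:

#include <cassert>
#include <iostream>
#include <vector>

// Illustrative host-side model of a tensor list holding 1-D float elements.
using Element = std::vector<float>;

int main() {
    // TensorListReserve: a list with num_elements uninitialized slots.
    std::vector<Element> list(3);
    // TensorListSetItem: assign the index-th position.
    list[0] = {1.0f, 2.0f};
    list[1] = {3.0f, 4.0f};
    list[2] = {5.0f, 6.0f};
    // TensorListStack: all elements must share one shape; the result
    // gains a leading dimension equal to the list length.
    std::vector<float> stacked;
    for (const Element &e : list) {
        assert(e.size() == list[0].size());
        stacked.insert(stacked.end(), e.begin(), e.end());
    }
    std::cout << "stacked shape: [" << list.size() << ", "
              << list[0].size() << "]\n";  // [3, 2]
}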

/**
*@brief Concatenates all tensors in the list along the 0th dimension.
Requires that all tensors have the same shape except for the first dimension. \n

*@par Inputs:
*@li input_handle: The input list.
*@li element_shape: The shape of the uninitialized elements in the list.
If the first dimension is not -1, it is assumed that all list elements have
the same leading dim.
*@li leading_dims: The list of leading dims of uninitialized list elements. Used if
the leading dim of input_handle.element_shape or the element_shape input arg
is not already set. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li tensor: The concatenated result.
*@li lengths: Output tensor containing sizes of the 0th dimension of tensors
in the list, used for computing the gradient. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListConcatV2 operator.
*/
REG_OP(TensorListConcatV2)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.INPUT(element_shape, TensorType({DT_INT32,DT_INT64}))
.INPUT(leading_dims, TensorType({DT_INT64}))
.OUTPUT(tensor, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.OUTPUT(lengths, TensorType({DT_INT64}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListConcatV2)

/**
*@brief Splits a tensor into a list. \n

*@par Inputs:
*@li tensor: The input tensor.
*@li element_shape: A shape compatible with that of elements in the tensor.
*@li lengths: Vector of sizes of the 0th dimension of tensors in the list. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li output_handle: The list. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListSplit operator.
*/
REG_OP(TensorListSplit)
.INPUT(tensor, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.INPUT(element_shape, TensorType({DT_INT32,DT_INT64}))
.INPUT(lengths, TensorType({DT_INT64}))
.OUTPUT(output_handle, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListSplit)

/**
*@brief Creates a TensorList which, when stacked, has the value of `tensor`. \n

*@par Inputs:
*@li tensor: The input tensor.
*@li element_shape: The shape of elements in the list. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li output_handle: An output tensor list . \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListFromTensor operator.
*/
REG_OP(TensorListFromTensor)
.INPUT(tensor, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.INPUT(element_shape, TensorType({DT_INT32,DT_INT64}))
.OUTPUT(output_handle, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListFromTensor)

/**
*@brief Resizes the list. \n

*@par Inputs:
*@li input_handle: The input tensor list.
*@li size: The size of the output list. \n

*@par Outputs:
*@li output_handle: The output tensor list. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListResize operator.
*/
REG_OP(TensorListResize)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.INPUT(size, TensorType({DT_INT32}))
.OUTPUT(output_handle, TensorType({DT_VARIANT}))
.OP_END_FACTORY_REG(TensorListResize)

/**
*@brief Creates a Tensor by indexing into the TensorList. \n

*@par Inputs:
*@li input_handle: The input tensor list.
*@li indices: The indices used to index into the list.
*@li element_shape: The shape of elements in the list. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li values: The tensor. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListGather operator.
*/
REG_OP(TensorListGather)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(element_shape, TensorType({DT_INT32}))
.OUTPUT(values, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListGather)

/**
*@brief Creates a TensorList by indexing into a Tensor. \n

*@par Inputs:
*@li tensor: The input tensor.
*@li indices: The indices used to index into the list.
*@li element_shape: The shape of the elements in the list (may be less specific than
the shape of the tensor).
*@li num_elements: The size of the output list. Must be large enough to accommodate
the largest index in indices. If -1, the list is just large enough to include
the largest index in indices. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li output_handle: The TensorList. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListScatterV2 operator.
*/
REG_OP(TensorListScatterV2)
.INPUT(tensor, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(element_shape, TensorType({DT_INT32,DT_INT64}))
.INPUT(num_elements, TensorType({DT_INT32}))
.OUTPUT(output_handle, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListScatterV2)

/**
*@brief Scatters tensor at indices in an input list. \n

*@par Inputs:
*@li input_handle: The input tensor list.
*@li tensor: The input tensor.
*@li indices: The indices used to index into the list. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li output_handle: The TensorList. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListScatterIntoExistingList operator.
*/
REG_OP(TensorListScatterIntoExistingList)
.INPUT(input_handle, TensorType({DT_VARIANT}))
.INPUT(tensor, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT8,
DT_INT16,DT_INT32,DT_INT64,DT_UINT8,DT_UINT16,DT_QINT8,DT_QUINT8,
DT_QINT16,DT_QUINT16,DT_QINT32,DT_BOOL,
DT_STRING,DT_COMPLEX64,DT_COMPLEX128}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(output_handle, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListScatterIntoExistingList)

/**
*@brief Concat two tensor lists to a new tensor list. \n

*@par Inputs:
*@li input_a: The input tensor list A.
*@li input_b: The input tensor list B. \n

*@par Attributes:
*@li element_dtype: The type of elements in the list. \n

*@par Outputs:
*@li output: The output list. \n

*@par Third-party framework compatibility.
*Compatible with tensorflow TensorListConcatLists operator.
*/
REG_OP(TensorListConcatLists)
.INPUT(input_a, TensorType({DT_VARIANT}))
.INPUT(input_b, TensorType({DT_VARIANT}))
.OUTPUT(output, TensorType({DT_VARIANT}))
.ATTR(element_dtype, Type, DT_INT32)
.OP_END_FACTORY_REG(TensorListConcatLists)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_LIST_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/logging_ops.h

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/lookup_ops.h

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 276
- 7
third_party/fwkacllib/inc/ops/math_ops.h

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -222,6 +222,24 @@ REG_OP(Bucketize)
.REQUIRED_ATTR(boundaries, ListFloat)
.OP_END_FACTORY_REG(Bucketize)

/**
*@brief Returns a new tensor with the truncated integer values of the elements of input. \n

*@par Inputs:
*One input, including:
* @li input_x: A tensor. Must be one of the following types: float16, float32, int8, uint8, int32. \n

*@par Outputs:
*output_y: A tensor with the same type and shape as "input_x". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator Trunc. \n
*/
REG_OP(Trunc)
.INPUT(input_x, TensorType({DT_FLOAT16,DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8}))
.OUTPUT(output_y, TensorType({DT_FLOAT16,DT_FLOAT, DT_INT8, DT_INT32, DT_UINT8}))
.OP_END_FACTORY_REG(Trunc)
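
For reference, truncation keeps the integer part and discards the fraction, rounding toward zero; a minimal sketch (names illustrative):

#include <cmath>
#include <cstdio>

int main() {
    const float xs[] = {2.7f, -2.7f, 0.4f, -0.4f};
    for (float x : xs) {
        // std::trunc rounds toward zero: 2.7 -> 2.0, -2.7 -> -2.0.
        std::printf("trunc(%.1f) = %.1f\n", x, std::trunc(x));
    }
}
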
/**
*@brief Computes the sum along sparse segments of a tensor . \n

@@ -365,6 +383,27 @@ REG_OP(GetNext)
.ATTR(channel_name, String, "")
.OP_END_FACTORY_REG(GetNext)

/**
*@brief Get dynamic dims after GetNext. \n

*@par Inputs:
*input: A nested structure of Tensor objects, from GetNext's output. \n

*@par Attributes:
*@li shape_info: GE shape info for each input; -1 means an unknown dim.
*@li N: The number of inputs. \n

*@par Outputs:
*dims: The GE unknown dims, a vector of int64. \n
*/

REG_OP(GetDynamicDims)
.DYNAMIC_INPUT(input, TensorType({DT_INT32, DT_INT64}))
.OUTPUT(dims, TensorType({DT_INT32, DT_INT64}))
.REQUIRED_ATTR(shape_info, ListInt)
.REQUIRED_ATTR(N, Int)
.OP_END_FACTORY_REG(GetDynamicDims)

/**
*@brief End of sequence . \n

@@ -494,6 +533,29 @@ REG_OP(NextAfter)
.OUTPUT(output, TensorType({DT_FLOAT, DT_DOUBLE}))
.OP_END_FACTORY_REG(NextAfter)

/**
*@brief Calculates the p-norm distance between every pair of row vectors in the input. \n

*@par Inputs:
*One input, including:
* @li x: A tensor. Must be one of the following types:
* float16, float32. \n

*@par Attributes:
*@li p: An optional float. Defaults to 2.0. \n

*@par Outputs:
*y: A Tensor of the same type as "x". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator Pdist. \n
*/
REG_OP(Pdist)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(p, Float, 2.0)
.OP_END_FACTORY_REG(Pdist)

/**
*@brief Compute element-wise finiteness, return a boolean tensor.

@@ -624,6 +686,7 @@ REG_OP(NLLLoss)
.OUTPUT(y, TensorType({DT_FLOAT}))
.OUTPUT(total_weight, TensorType({DT_FLOAT}))
.ATTR(reduction, String, "mean")
.ATTR(ignore_index, Int, -100)
.OP_END_FACTORY_REG(NLLLoss)

/**
@@ -653,6 +716,7 @@ REG_OP(NLLLossGrad)
.INPUT(total_weight, TensorType({DT_FLOAT}))
.OUTPUT(x_grad, TensorType({DT_FLOAT}))
.ATTR(reduction, String, "mean")
.ATTR(ignore_index, Int, -100)
.OP_END_FACTORY_REG(NLLLossGrad)

/**
@@ -710,6 +774,9 @@ REG_OP(IFMR)

*@par Third-party framework compatibility
*Compatible with mindspore

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/

REG_OP(WtsARQ)
@@ -741,6 +808,9 @@ REG_OP(WtsARQ)

*@par Third-party framework compatibility
*Compatible with mindspore

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/

REG_OP(ActsULQ)
@@ -748,8 +818,8 @@ REG_OP(ActsULQ)
.INPUT(clamp_min, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(clamp_max, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(clamp_min_mask, TensorType({DT_BOOL}))
.OUTPUT(clamp_max_mask, TensorType({DT_BOOL}))
.OUTPUT(clamp_min_mask, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT}))
.OUTPUT(clamp_max_mask, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT}))
.OUTPUT(x_clamped_loss, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(fixed_min, Bool, false)
.ATTR(num_bits, Int, 8)
@@ -768,12 +838,15 @@ REG_OP(ActsULQ)

*@par Third-party framework compatibility
*Compatible with mindspore

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/

REG_OP(ActsULQInputGrad)
.INPUT(y_grad, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(clamp_min_mask, TensorType({DT_BOOL}))
.INPUT(clamp_max_mask, TensorType({DT_BOOL}))
.INPUT(clamp_min_mask, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT}))
.INPUT(clamp_max_mask, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT}))
.OUTPUT(x_grad, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(ActsULQInputGrad)

@@ -790,11 +863,14 @@ REG_OP(ActsULQInputGrad)

*@par Third-party framework compatibility
*Compatible with mindspore

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/

REG_OP(ActULQClampMaxGrad)
.INPUT(y_grad, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(clamp_max_mask, TensorType({DT_BOOL}))
.INPUT(clamp_max_mask, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT}))
.INPUT(x_clamped_loss, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(clamp_max_grad, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(ActULQClampMaxGrad)
@@ -812,15 +888,208 @@ REG_OP(ActULQClampMaxGrad)

*@par Third-party framework compatibility
*Compatible with mindspore

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/

REG_OP(ActULQClampMinGrad)
.INPUT(y_grad, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(clamp_min_mask, TensorType({DT_BOOL}))
.INPUT(clamp_min_mask, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT}))
.INPUT(x_clamped_loss, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(clamp_min_grad, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(ActULQClampMinGrad)

/**
* @brief Computes Lp norm.

* @par Inputs:
* @li x: An ND tensor of type float16, float32. \n
*
* @par Attributes:
* @li p: Int, "inf" or "-inf", default value is 2.
* @li axes: ListInt, {} means all axes will be computed.
* @li keepdim: Bool, default is false.
* @li epsilon: Float, default is 1e-12. \n

* @par Outputs:
* @li y: An ND tensor of type float16, float32. The shape of y depends
* on "axes" and "keepdim". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator LpNorm.
*/
REG_OP(LpNorm)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(p, Int, 2)
.ATTR(axes, ListInt, {})
.ATTR(keepdim, Bool, false)
.ATTR(epsilon, Float, 1e-12)
.OP_END_FACTORY_REG(LpNorm)
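
Over all axes (axes = {}) the Lp norm above reduces to (sum_i |x_i|^p)^(1/p). The sketch below also applies "epsilon" as a lower bound on the result, which is one plausible reading of that attribute, not a confirmed one:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Lp norm over every element; epsilon used as a floor (assumed semantics).
float LpNorm(const std::vector<float> &x, int p, float epsilon = 1e-12f) {
    float acc = 0.0f;
    for (float v : x) acc += std::pow(std::fabs(v), static_cast<float>(p));
    return std::max(std::pow(acc, 1.0f / p), epsilon);
}

int main() {
    std::printf("%.1f\n", LpNorm({3.0f, -4.0f}, 2));  // 5.0
}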

/**
* @brief Constructs a complex tensor from two real tensors.

* @par Inputs:
* @li real: An ND tensor of type float32 or double.
* @li imag: An ND tensor of type float32 or double. \n
*
* @par Outputs:
* @li out: An ND tensor of type complex64 or complex128. \n
*/
REG_OP(Complex)
.INPUT(real, TensorType({DT_FLOAT, DT_DOUBLE}))
.INPUT(imag, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(out, TensorType({DT_COMPLEX64, DT_COMPLEX128}))
.ATTR(Tout, Type, DT_COMPLEX64)
.OP_END_FACTORY_REG(Complex)

/**
* @brief Returns the imaginary part of a complex tensor.

* @par Inputs:
* @li input: An ND tensor of type complex64 or complex128. \n
*
* @par Outputs:
* @li output: An ND tensor of type float32 or double. \n
*/
REG_OP(Imag)
.INPUT(input, TensorType({DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(output, TensorType({DT_FLOAT, DT_DOUBLE}))
.ATTR(Tout, Type, DT_FLOAT)
.OP_END_FACTORY_REG(Imag)

/**
* @brief Returns the element-wise argument (angle) of a complex tensor.

* @par Inputs:
* @li input: An ND tensor of type complex64 or complex128. \n
*
* @par Outputs:
* @li output: An ND tensor of type float32 or double. \n
*/
REG_OP(Angle)
.INPUT(input, TensorType({DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(output, TensorType({DT_FLOAT, DT_DOUBLE}))
.ATTR(Tout, Type, DT_FLOAT)
.OP_END_FACTORY_REG(Angle)
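
The three ops above map directly onto std::complex: construct from (real, imag), then read back the imaginary part and the argument; a brief sketch:

#include <complex>
#include <cstdio>

int main() {
    // Complex: out = real + i * imag.
    std::complex<float> z(3.0f, 4.0f);
    // Imag: the imaginary component.
    std::printf("imag = %.1f\n", std::imag(z));   // 4.0
    // Angle: the argument atan2(imag, real), in radians.
    std::printf("angle = %.4f\n", std::arg(z));   // 0.9273
}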

/**
*@brief Computes the gradient of SoftMarginLoss. \n

*@par Inputs:
*Three inputs, including:
* @li predict: A tensor. Must be one of the following types:
* float16, float32. \n
* @li label: A tensor with the same shape as "predict". Must be one of the following types:
* float16, float32. \n
* @li dout: A tensor with the same shape as "predict". Must be one of the following types:
* float16, float32. \n

*@par Attributes:
* @li reduction: Specifies the reduction to apply to the output:
* 'none' | 'mean' | 'sum'. Default: 'mean'. \n

*@par Outputs:
* gradient: A Tensor with the same type as "predict". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator SoftMarginLoss Backward. \n
*/
REG_OP(SoftMarginLossGrad)
.INPUT(predict, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(label, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(dout, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(gradient, TensorType({DT_FLOAT16,DT_FLOAT}))
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(SoftMarginLossGrad)

/**
*@brief Calculates the cross product of two tensors. \n

*@par Inputs:
*Two inputs, including:
* @li x1: A tensor. Must be one of the following types:
* float16, float32, int32, int8, uint8, int16. \n
* @li x2: A tensor. Must be one of the following types:
* float16, float32, int32, int8, uint8, int16. \n

*@par Attributes:
*@li dim: The dimension along which to compute the cross product. Defaults to -65530 (a sentinel value selecting the default dimension). \n

*@par Outputs:
*y: A Tensor with the same type and shape as "x1". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator cross. \n
*/
REG_OP(Cross)
.INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8, DT_INT16}))
.INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8, DT_INT16}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8, DT_INT16}))
.ATTR(dim, Int, -65530)
.OP_END_FACTORY_REG(Cross)
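
Along a dimension of size 3 the op computes the classical vector cross product; a sketch for a single pair of 3-vectors (names illustrative):

#include <array>
#include <cstdio>

// Cross product of two 3-vectors: y = a x b.
std::array<float, 3> Cross(const std::array<float, 3> &a,
                           const std::array<float, 3> &b) {
    return {a[1] * b[2] - a[2] * b[1],
            a[2] * b[0] - a[0] * b[2],
            a[0] * b[1] - a[1] * b[0]};
}

int main() {
    auto y = Cross({1, 0, 0}, {0, 1, 0});
    std::printf("[%g, %g, %g]\n", y[0], y[1], y[2]);  // [0, 0, 1]
}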

/**
*@brief Computes the batched p-norm distance between each pair of
*the two collections of row vectors. \n

*@par Inputs:
*Two inputs, including:
* @li x1: A tensor with shape B x P x M. Must be one of the following types:
* float16, float32. \n
* @li x2: A tensor with shape B x R x M. Must be one of the following types:
* float16, float32. \n

*@par Attributes:
* @li p: An optional float >= 0 or inf. Defaults to 2.0. \n

*@par Outputs:
* y: A Tensor with the same type as "x1" and with shape B x P x R. \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator Cdist. \n
*/
REG_OP(Cdist)
.INPUT(x1, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(x2, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(p, Float, 2.0)
.OP_END_FACTORY_REG(Cdist)

/**
*@brief Computes the grad of x1 in cdist. \n

*@par Inputs:
*Four inputs, including:
* @li grad: Grad with shape B x P x R. Must be one of the following types:
* float16, float32. \n
* @li x1: A tensor with shape B x P x M. Must be one of the following types:
* float16, float32. \n
* @li x2: A tensor with shape B x R x M. Must be one of the following types:
* float16, float32. \n
* @li cdist: The output tensor of the cdist forward pass, with shape B x P x R.
* Must be one of the following types: float16, float32. \n

*@par Attributes:
* @li p: An optional float >= 0 or inf. Defaults to 2.0. \n

*@par Outputs:
* y: A Tensor with the same type and shape as "x1". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator Cdist Backward. \n
*/
REG_OP(CdistGrad)
.INPUT(grad, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(x1, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(x2, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(cdist, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.ATTR(p, Float, 2.0)
.OP_END_FACTORY_REG(CdistGrad)

} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_MATH_OPS_H_

+ 285
- 51
third_party/fwkacllib/inc/ops/matrix_calculation_ops.h

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -38,8 +38,8 @@ namespace ge {
* float32, int32. Has format [ND, NHWC] . \n

*@par Attributes:
*@li transpose_a: A bool. If True, changes the shape of "x1" from [M, K] to [K, M].
*@li transpose_b: A bool. If True, changes the shape of "x2" from [M, K] to [K, M] . \n
*@li transpose_x1: A bool. If True, changes the shape of "x1" from [M, K] to [K, M].
*@li transpose_x2: A bool. If True, changes the shape of "x2" from [M, K] to [K, M] . \n

*@par Outputs:
*y: The result matrix Tensor. 2D. Must be one of the following types: float16,
@@ -70,8 +70,8 @@ REG_OP(MatMul)
* float32, int32. Has format [ND, NHWC] . \n

*@par Attributes:
*@li transpose_a: A bool. If True, changes the shape of "x1" from [M, K] to [K, M].
*@li transpose_b: A bool. If True, changes the shape of "x2" from [M, K] to [K, M] . \n
*@li transpose_x1: A bool. If True, changes the shape of "x1" from [M, K] to [K, M].
*@li transpose_x2: A bool. If True, changes the shape of "x2" from [M, K] to [K, M] . \n

*@par Outputs:
*y: The result matrix Tensor. 2D. Must be one of the following types: float16,
@@ -91,6 +91,36 @@ REG_OP(MatMulV2)
.ATTR(offset_x, Int, 0)
.OP_END_FACTORY_REG(MatMulV2)

/**
*@brief Multiplies matrix "a" by matrix "b", producing "a * b" . \n

*@par Inputs:
*Four inputs, including:
* @li x1: A matrix Tensor. 2D. Must be one of the following types: int8.
* @li x2: A matrix Tensor. 2D. Must be one of the following types: int8.
* @li compress_index: A compress index matrix of type int8.
* @li bias: A 1D Tensor. Must be one of the following types: int32, float16.

*@par Attributes:
*@li transpose_x1: A bool. If True, changes the shape of "x1" from [M, K] to [K, M].
*@li transpose_x2: A bool. If True, changes the shape of "x2" from [M, K] to [K, M] . \n

*@par Outputs:
*y: The result matrix Tensor. 2D. Must be one of the following types: float16,
* int32. \n

*/
REG_OP(MatMulV2Compress)
.INPUT(x1, TensorType({DT_INT8}))
.INPUT(x2, TensorType({DT_INT8}))
.INPUT(compress_index, TensorType({DT_INT8}))
.OPTIONAL_INPUT(bias, TensorType({DT_INT32, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_INT32, DT_FLOAT16}))
.OPTIONAL_INPUT(offset_w, TensorType({DT_INT8}))
.ATTR(transpose_x1, Bool, false)
.ATTR(transpose_x2, Bool, false)
.ATTR(offset_x, Int, 0)
.OP_END_FACTORY_REG(MatMulV2Compress)

/**
*@brief Performs Matrix-to-matrix Multiply, producing c=alpha[0]*a*b+beta[0]*c . \n
@@ -149,15 +179,15 @@ REG_OP(GEMM)
*@brief Multiplies matrix "a" by matrix "b", producing "a * b" . \n

*@par Inputs:
*Three inputs, including:
*Two inputs, including:
* @li x1: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ].
* @li x2: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ] . \n

*@par Attributes:
*@li adj_x: A bool. If True, changes the shape of "x1" from [B, M, K] to [B, K, M].
*@li adj_y: A bool. If True, changes the shape of "x2" from [B, M, K] to [B, K, M] . \n
*@li adj_x1: A bool. If True, changes the shape of "x1" from [B, M, K] to [B, K, M].
*@li adj_x2: A bool. If True, changes the shape of "x2" from [B, M, K] to [B, K, M] . \n

*@par Outputs:
*y: The result matrix Tensor. 2D or higher. Must be one of the following types: float16,
@@ -175,6 +205,42 @@ REG_OP(BatchMatMul)
.ATTR(adj_x2, Bool, false)
.OP_END_FACTORY_REG(BatchMatMul)


/**
* @brief Multiplies matrix "a" by matrix "b", producing "a * b" . \n

* @par Inputs:
* Three inputs, including:
* @li x1: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ].
* @li x2: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ] . \n
* @li bias: A matrix Tensor. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ] . \n

* @par Attributes:
* @li adj_x1: A bool. If True, changes the shape of "x1" from [B, M, K] to [B, K, M].
* @li adj_x2: A bool. If True, changes the shape of "x2" from [B, M, K] to [B, K, M] . \n

* @par Outputs:
* y: The result matrix Tensor. 2D or higher. Must be one of the following types: float16,
* float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ]. Has the same shape length as "x1" and "x2" . \n

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator BatchMatmul.
*/

REG_OP(BatchMatMulV2)
.INPUT(x1, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8}))
.INPUT(x2, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8}))
.OPTIONAL_INPUT(bias, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.OPTIONAL_INPUT(offset_w, TensorType({DT_INT8}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.ATTR(adj_x1, Bool, false)
.ATTR(adj_x2, Bool, false)
.ATTR(offset_x, Int, 0)
.OP_END_FACTORY_REG(BatchMatMulV2)

/**
*@brief Computes half the L2 norm of a tensor without the sqrt . \n

@@ -334,7 +400,7 @@ REG_OP(MatrixSetDiagD)
* int64, complex64, qint8, quint8, qint32, uint16, complex128, half, uint32,
* uint64
*@li indices: An ND Tensor.
*Must be one of the following types: int32, int64
*Must be one of the following types: int32 or int64
*@li updates: An ND Tensor.
*Must be one of the following types: float16, float32, int8, uint8, double,
* int64, complex64, qint8, quint8, qint32, uint16, complex128, half, uint32,
@@ -378,6 +444,9 @@ REG_OP(ScatterNdUpdate)

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator TensorScatterUpdate.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(TensorScatterUpdate)
.INPUT(x, TensorType::BasicType())
@@ -386,6 +455,34 @@ REG_OP(TensorScatterUpdate)
.OUTPUT(y, TensorType::BasicType())
.OP_END_FACTORY_REG(TensorScatterUpdate)

/**
*@brief Uses "updates" to update tensor "data" by "indices". \n

*@par Inputs:
* Three inputs, including:
*@li data: An ND Tensor . \n
*Must be one of the following types: float16, float32, int32, int8, uint8
*@li indices: An ND Tensor of type int32 or int64
*@li updates: A Tensor with the same shape as "indices". Format: NCHW, NHWC. \n
*Must be one of the following types: float16, float32, int32, int8, uint8

*@par Attributes:
*@li axis: An optional attribute. Defaults to 0.

*@par Outputs:
*y: A Tensor. Has the same type and format as input "data" . \n

*@par Third-party framework compatibility
* Compatible with the ONNX operator ScatterElements.
*/
REG_OP(ScatterElements)
.INPUT(data, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(axis, Int, 0)
.OP_END_FACTORY_REG(ScatterElements)
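
In the 1-D, axis-0 case the op reduces to y[indices[i]] = updates[i]; a minimal sketch of that special case (the general form walks every element of "updates" and scatters along "axis"):

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> data = {0, 0, 0, 0, 0};
    std::vector<int> indices = {1, 3};
    std::vector<float> updates = {10.0f, 30.0f};
    // ScatterElements, 1-D case with axis = 0: y[indices[i]] = updates[i].
    std::vector<float> y = data;
    for (size_t i = 0; i < indices.size(); ++i) y[indices[i]] = updates[i];
    for (float v : y) std::printf("%g ", v);  // 0 10 0 30 0
    std::printf("\n");
}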

/**
*@brief Adds sparse "updates" to a variable reference . \n

@@ -394,7 +491,7 @@ REG_OP(TensorScatterUpdate)
*@li var: An ND Tensor . \n

*Must be one of the following types: float16, float32, int32, int8, uint8
*@li indices: An ND Tensor of type int32 or int64.
*@li indices: An ND Tensor of type int32 or int64


*@li updates: A Tensor. Format: NCHW, NHWC. \n
@@ -412,10 +509,10 @@ REG_OP(TensorScatterUpdate)
* Compatible with the TensorFlow operator ScatterAdd.
*/
REG_OP(ScatterAdd)
.INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterAdd)

@@ -428,7 +525,7 @@ REG_OP(ScatterAdd)
*Must be one of the following types: float16, float, int32, int8, uint8

*@li indices: An ND Tensor.
*Must be one of the following types: int32
*Must be one of the following types: int32 or int64
*@li updates: An ND Tensor.
*Must be one of the following types: float16, float, int32, int8, uint8

@@ -443,10 +540,10 @@ REG_OP(ScatterAdd)
* Compatible with the TensorFlow operator ScatterDiv.
*/
REG_OP(ScatterDiv)
.INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(updates, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterDiv)

@@ -458,7 +555,7 @@ REG_OP(ScatterDiv)
*@li var: An ND Tensor.
*Must be one of the following types: float16, float, int32, int8, uint8
*@li indices: An ND Tensor.
*Must be one of the following types: int32
*Must be one of the following types: int32 or int64
*@li updates: An ND Tensor.
*Must be one of the following types: float16, float, int32, int8, uint8
*@par Attributes:
@@ -472,10 +569,10 @@ REG_OP(ScatterDiv)
* Compatible with the TensorFlow operator ScatterNdAdd.
*/
REG_OP(ScatterNdAdd)
.INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterNdAdd)

@@ -499,6 +596,9 @@ REG_OP(ScatterNdAdd)

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator TensorScatterAdd.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(TensorScatterAdd)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
@@ -515,7 +615,7 @@ REG_OP(TensorScatterAdd)
*@li var: An ND Tensor.
*Must be one of the following types: float16, float, int32, int8, uint8
*@li indices: An ND Tensor.
*Must be one of the following types: int32, int64
*Must be one of the following types: int32 or int64
*@li updates: An ND Tensor.
*Must be one of the following types: float16, float, int32, int8, uint8

@@ -530,10 +630,10 @@ REG_OP(TensorScatterAdd)
* Compatible with the TensorFlow operator ScatterNdSub.
*/
REG_OP(ScatterNdSub)
.INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterNdSub)

@@ -557,6 +657,9 @@ REG_OP(ScatterNdSub)

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator TensorScatterSub.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(TensorScatterSub)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
@@ -573,7 +676,7 @@ REG_OP(TensorScatterSub)
*@li var: An ND Tensor.
*Must be one of the following types: float16, float, int32, int8, uint8
*@li indices: An ND Tensor.
*Must be one of the following types: int32, int64
*Must be one of the following types: int32 or int64
*@li updates: An ND Tensor.
*Must be one of the following types: float16, float, int32, int8, uint8
*@par Attributes:
@@ -587,10 +690,10 @@ REG_OP(TensorScatterSub)
* Compatible with the TensorFlow operator ScatterSub.
*/
REG_OP(ScatterSub)
.INPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16, DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterSub)

@@ -761,7 +864,7 @@ REG_OP(ConfusionMatrix)
*@li var: An ND Tensor.
*Must be one of the following types: float16, float, int32, int8, uint8
*@li indices: An ND Tensor.
*Must be one of the following types: int32
*Must be one of the following types: int32 or int64
*@li updates: An ND Tensor . \n

*Must be one of the following types: float16, float, int32, int8, uint8
@@ -778,7 +881,7 @@ REG_OP(ConfusionMatrix)
*/
REG_OP(ScatterMul)
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(use_locking, Bool, false)
@@ -791,13 +894,13 @@ REG_OP(ScatterMul)
*@par Inputs:
* Three inputs, including:
*@li var: An ND Tensor.
*Must be one of the following types: float16, float, int32
*Must be one of the following types: float16, float, int32, int8, uint8

*@li indices: An ND Tensor.
*Must be one of the following types: int32
*Must be one of the following types: int32 or int64

*@li updates: An ND Tensor.
*Must be one of the following types: float16, float, int32
*Must be one of the following types: float16, float, int32, int8, uint8

*@par Attributes:
*use_locking: An optional bool. Defaults to "False". If "True", the operation
@@ -810,10 +913,10 @@ REG_OP(ScatterMul)
* Compatible with the TensorFlow operator ScatterMin.
*/
REG_OP(ScatterMin)
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32}))
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterMin)

@@ -824,13 +927,13 @@ REG_OP(ScatterMin)
* Three inputs, including:
*@li var: An ND Tensor . \n

*Must be one of the following types: float16, float, int32
*Must be one of the following types: float16, float, int32, int8, uint8
*@li indices: An NCHW, NHWC, or ND Tensor . \n

*Must be one of the following types: int32
*Must be one of the following types: int32 or int64
*@li updates: An NCHW, NHWC, or ND Tensor . \n

*Must be one of the following types: float16, float, int32
*Must be one of the following types: float16, float, int32, int8, uint8

*@par Attributes:
*use_locking: An optional bool. Defaults to "False".
@@ -843,10 +946,10 @@ REG_OP(ScatterMin)
* Compatible with the TensorFlow operator ScatterMax.
*/
REG_OP(ScatterMax)
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32}))
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterMax)

@@ -860,7 +963,7 @@ REG_OP(ScatterMax)
*Must be one of the following types: float16, float, int32, int8, uint8
*@li indices: An ND Tensor . \n

*Must be one of the following types: int32
*Must be one of the following types: int32 or int64
*@li updates: An ND Tensor . \n

*Must be one of the following types: float16, float, int32, int8, uint8
@@ -876,10 +979,10 @@ REG_OP(ScatterMax)
* Compatible with the TensorFlow operator ScatterUpdate.
*/
REG_OP(ScatterUpdate)
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8}))
.INPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(updates, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(var, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(ScatterUpdate)

@@ -979,6 +1082,137 @@ REG_OP(MatrixDiagV2)
.OUTPUT(output, TensorType::BasicType())
.OP_END_FACTORY_REG(MatrixDiagV2)

/**
* @brief Add updates to var_out according to axis and indices.

* @par Inputs:
* Three inputs, including:
* @li var: A Tensor. Must be one of the following types:
* float16, float32, int32, int8, uint8.
* @li indices: A Tensor of the indices, type should be int32.
* @li updates: A Tensor of the same type as "var".

* @par Attributes:
* @li axis: A required int specifying the axis along which to perform the index add.

* @par Outputs:
* @li var_out: A Tensor. Same as input "var".

* @par Third-party framework compatibility
* Compatible with the Pytorch operator index_add.

* @par Restrictions:
* Warning:THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(IndexAdd)
.INPUT(var, TensorType({DT_INT32, DT_INT8, DT_UINT8, DT_FLOAT32, DT_FLOAT16}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(updates, TensorType({DT_INT32, DT_INT8, DT_UINT8, DT_FLOAT32, DT_FLOAT16}))
.OUTPUT(var_out, TensorType({DT_INT32, DT_INT8, DT_UINT8, DT_FLOAT32, DT_FLOAT16}))
.ATTR(axis, Int, 0)
.OP_END_FACTORY_REG(IndexAdd)
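
With axis = 0 and 1-D operands, the accumulation above is var_out[indices[i]] += updates[i]; a host-side sketch (names illustrative):

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> var = {1, 1, 1, 1};
    std::vector<int> indices = {0, 2};
    std::vector<float> updates = {5.0f, 7.0f};
    // IndexAdd, axis = 0: accumulate updates into the indexed rows.
    std::vector<float> var_out = var;
    for (size_t i = 0; i < indices.size(); ++i)
        var_out[indices[i]] += updates[i];
    for (float v : var_out) std::printf("%g ", v);  // 6 1 8 1
    std::printf("\n");
}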

/**
*@brief: Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices input \n

*@par Inputs:
* One input, including:
*@li x: A Tensor. Must be one of the following types:
* float16, float32, double, int32, uint8, int16, int8, complex64, int64,
* qint8, quint8, qint32, uint16, complex128, uint32, uint64. \n

*@par Attributes:
*@li diagonal: An optional int. The diagonal to consider. Defaults to 0. \n

*@par Outputs:
*y: A Tensor. Has the same type as "x" . \n

*@par Third-party framework compatibility
* Compatible with the Pytorch operator Triu.
*/
REG_OP(Triu)
.INPUT(x, TensorType::BasicType())
.ATTR(diagonal, Int, 0)
.OUTPUT(y, TensorType::BasicType())
.OP_END_FACTORY_REG(Triu)

/**
*@brief: Returns the lower triangular part of a matrix (2-D tensor) or batch of matrices input \n

*@par Inputs:
* One input, including:
*@li x: A Tensor. Must be one of the following types:
* float16, float32, double, int32, uint8, int16, int8, complex64, int64,
* qint8, quint8, qint32, uint16, complex128, uint32, uint64. \n

*@par Attributes:
*@li diagonal: An optional int. The diagonal to consider. Defaults to 0. \n

*@par Outputs:
*y: A Tensor. Has the same type as "x" . \n

*@par Third-party framework compatibility
* Compatible with the Pytorch operator Tril.
*/
REG_OP(Tril)
.INPUT(x, TensorType::BasicType())
.ATTR(diagonal, Int, 0)
.OUTPUT(y, TensorType::BasicType())
.OP_END_FACTORY_REG(Tril)
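
Both ops keep one triangle of each trailing 2-D matrix and zero the other; "diagonal" shifts the boundary (0 = main diagonal, > 0 above it, < 0 below). A sketch of the masking rule (names illustrative):

#include <cstdio>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

// upper = true models Triu (keep j >= i + diagonal);
// upper = false models Tril (keep j <= i + diagonal).
Matrix Triangle(Matrix x, int diagonal, bool upper) {
    for (int i = 0; i < (int)x.size(); ++i)
        for (int j = 0; j < (int)x[i].size(); ++j) {
            bool keep = upper ? (j >= i + diagonal) : (j <= i + diagonal);
            if (!keep) x[i][j] = 0.0f;
        }
    return x;
}

int main() {
    Matrix m = {{1, 2}, {3, 4}};
    Matrix u = Triangle(m, 0, true);  // {{1, 2}, {0, 4}}
    std::printf("%g %g / %g %g\n", u[0][0], u[0][1], u[1][0], u[1][1]);
}
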
/**
*@brief Sums the product of the elements of the input operands along dimensions
*specified using a notation based on the Einstein summation convention. \n
*@par Inputs:
* One input, including:
* @li x: A list of N input tensors. Must be one of the following types:
* int32, float16, float32.
* It's a dynamic input. \n

*@par Attributes:
*equation: The subscripts for the Einstein summation. \n
*N: The number of input tensors. \n

*@par Outputs:
*@li y: The result tensor of the Einstein summation. \n

*@attention Constraints:
*Input N must be Int. \n

*@par Third-party framework compatibility
*Compatible with Pytorch einsum operator.
*/
REG_OP(Einsum)
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.REQUIRED_ATTR(equation, String)
.REQUIRED_ATTR(N, Int)
.OP_END_FACTORY_REG(Einsum)
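
As one concrete instance of the convention, the equation "ij,jk->ik" is an ordinary matrix product: summation runs over the repeated index j. A sketch of that single case only (a general einsum parser is out of scope here):

#include <cstdio>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

// Einsum "ij,jk->ik": y[i][k] = sum_j a[i][j] * b[j][k].
Matrix EinsumIJK(const Matrix &a, const Matrix &b) {
    Matrix y(a.size(), std::vector<float>(b[0].size(), 0.0f));
    for (size_t i = 0; i < a.size(); ++i)
        for (size_t j = 0; j < b.size(); ++j)
            for (size_t k = 0; k < b[0].size(); ++k)
                y[i][k] += a[i][j] * b[j][k];
    return y;
}

int main() {
    Matrix y = EinsumIJK({{1, 2}}, {{3}, {4}});
    std::printf("%g\n", y[0][0]);  // 11
}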

/**
*@brief Returns a 2-D tensor with ones on the diagonal and zeros elsewhere. \n

*@par Inputs:
*No inputs

*@par Attributes:
*@li num_rows: A required int. \n
*@li num_columns: An optional int. Defaults to 0. \n
*@li batch_shape: An optional ListInt. Defaults to []. \n
*@li dtype: An optional int. Defaults to 0. \n

*@par Outputs:
*y: A Tensor with targeted type and shape. \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator Eye. \n
*/
REG_OP(Eye)
.OUTPUT(y, TensorType::BasicType()) /* "Result, has targeted element type" */
.REQUIRED_ATTR(num_rows, Int)
.ATTR(num_columns, Int, 0)
.ATTR(batch_shape, ListInt, {})
.ATTR(dtype, Int, 0)
.OP_END_FACTORY_REG(Eye)

} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_MATRIX_CALCULATION_OPS_H_

+ 105
- 29
third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -144,6 +144,64 @@ REG_OP(BatchNorm)
/**
*@brief Performs batch normalization . \n

*@par Inputs:
* Five inputs, including: (NDHWC, NCDHW, or NDC1HWC0 supported)
*@li x: A 5D or 6D Tensor of type float16 or float32, with format NDHWC or NCDHW for 5D or NDC1HWC0 for 6D.
*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW. Must be 6D
if input "x" is with format NDC1HWC0. Specifies the scaling factor.
*@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW. Must be 6D
if input "x" is with format NDC1HWC0. Specifies the offset.
*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW. Must be 6D
if input "x" is with format NDC1HWC0. Specifies the mean used for inference. Must be "None" if the
operation is used for training.
*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW. Must be
6D if input "x" is with format NDC1HWC0. Specifies the variance used for inference. Must be "None"
if the operation is used for training . \n

*@par Attributes:
*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.0001".
*@li data_format: An optional string, specifying the format of "x". Defaults to "NCDHW".
*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True" . \n

*@par Outputs:
* Five outputs, including: (NDHWC, NCDHW, or NDC1HWC0 supported)
*@li y: A 5D or 6D Tensor of type float16 or float32 for the normalized "x", with format NDHWC or NCDHW for 5D or NDC1HWC0 for 6D.
*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW. Must be 6D
if input "x" is with format NDC1HWC0. Specifies the mean of "x".
*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW.
Must be 6D if input "x" is with format NDC1HWC0. Specifies the variance of "x".
*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW.
Must be 6D if input "x" is with format NDC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NDHWC or NCDHW.
Must be 6D if input "x" is with format NDC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output . \n

*@attention Constraints:
*@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available,
then "reserve_space_1" has the same value as "mean" and "reserve_space_2" has the same value as "variance".
*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction . \n

*@par Third-party framework compatibility
*@li Compatible with the TensorFlow operator fused_batch_norm.
*@li Compatible with the TensorFlow operator fused_batch_norm_v2.
*/
REG_OP(BatchNorm3D)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(scale, TensorType({DT_FLOAT}))
.INPUT(offset, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(mean, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(variance, TensorType({DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(batch_mean, TensorType({DT_FLOAT}))
.OUTPUT(batch_variance, TensorType({DT_FLOAT}))
.OUTPUT(reserve_space_1, TensorType({DT_FLOAT}))
.OUTPUT(reserve_space_2, TensorType({DT_FLOAT}))
.ATTR(epsilon, Float, 0.0001)
.ATTR(data_format, String, "NCDHW")
.ATTR(is_training, Bool, true)
.OP_END_FACTORY_REG(BatchNorm3D)
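
Per channel c, the inference-mode transform described above is y = scale[c] * (x - mean[c]) / sqrt(variance[c] + epsilon) + offset[c]; a scalar sketch of that formula (names illustrative, epsilon matching the attribute default):

#include <cmath>
#include <cstdio>

// Inference-mode batch norm for one element of a given channel.
float BatchNormOne(float x, float scale, float offset, float mean,
                   float variance, float epsilon = 1e-4f) {
    return scale * (x - mean) / std::sqrt(variance + epsilon) + offset;
}

int main() {
    // x = 3 in a channel with mean 1 and variance 4: (3 - 1) / 2, about 1.
    std::printf("%.4f\n", BatchNormOne(3.0f, 1.0f, 0.0f, 1.0f, 4.0f));
}
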
/**
*@brief Performs batch normalization . \n

*@par Inputs:
* Five inputs, including: (NHWC or NCHW supported)
*@li x: A 4D Tensor of type float16 or float32.
@@ -242,6 +300,52 @@ REG_OP(BatchNormGrad)
/**
*@brief Performs the backpropagation of BatchNorm . \n

*@par Inputs:
* Five inputs, including:
*@li y_backprop: A 5D or 6D Tensor of type float16 or float32, with format NDHWC, NCDHW, or NDC1HWC0, for the gradient.
*@li x: A 5D or 6D Tensor of type float16 or float32, with format NDHWC, NCDHW, or NDC1HWC0.
*@li scale: A 5D or 6D Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0.
*@li reserve_space_1: A 5D or 6D Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0. It is an output of BatchNorm3D.
*@li reserve_space_2: A 5D or 6D Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0. It is an output of BatchNorm3D . \n

*@par Attributes:
*@li epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x".
*@li data_format: An optional string. Defaults to "NCDHW".
*@li is_training: An optional bool. Defaults to "true". Specifies the operation is for training (default) or inference . \n

*@par Outputs:
*@li x_backprop: A Tensor of type float16 or float32, with format NDHWC, NCDHW, or NDC1HWC0, for the offset of "x".
*@li scale_backprop: A Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0, for the offset of "scale".
*@li offset_backprop: A Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0, for the offset of "offset".
*@li reserve_space_4: A Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0. Pass "None" to skip this output.
*@li reserve_space_5: A Tensor of type float32, with format NDHWC, NCDHW, or NDC1HWC0. Pass "None" to skip this output . \n

*@attention Constraints:
* The preceding layer of this operator must be operator BatchNorm . \n

*@see BatchNorm
*@par Third-party framework compatibility
* Compatible with the TensorFlow operators FusedBatchNormGradV2 and FusedBatchNorm3DGrad.
*/
REG_OP(BatchNorm3DGrad)
.INPUT(y_backprop, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(scale, TensorType({DT_FLOAT}))
.INPUT(reserve_space_1, TensorType({DT_FLOAT}))
.INPUT(reserve_space_2, TensorType({DT_FLOAT}))
.OUTPUT(x_backprop, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(scale_backprop, TensorType({DT_FLOAT}))
.OUTPUT(offset_backprop, TensorType({DT_FLOAT}))
.OUTPUT(reserve_space_4, TensorType({DT_FLOAT}))
.OUTPUT(reserve_space_5, TensorType({DT_FLOAT}))
.ATTR(epsilon, Float, 0.0001)
.ATTR(data_format, String, "NCDHW")
.ATTR(is_training, Bool, true)
.OP_END_FACTORY_REG(BatchNorm3DGrad)

/**
*@brief Performs the backpropagation of BatchNorm . \n

*@par Inputs:
* Five inputs, including:
*@li y_backprop: A 4D Tensor of type float16 or float32, with format NHWC or NCHW, for the gradient.
@@ -315,35 +419,7 @@ REG_OP(BNInference)
.ATTR(use_global_stats, Bool,true)
.ATTR(mode, Int,1)
.OP_END_FACTORY_REG(BNInference)
/**
*@brief aicpu batch normalization host . \n

*@par Inputs:

*@li mean: A 1D Tensor of type float32 or float16. Specifies the mean used for inference.
*@li variance: A 1D Tensor of type float32 or float16. Specifies the variance used for inference.
*@li momentum: An optional float; a scale factor for the mean and variance.
*@par Attributes:
*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.00001".
*@li use_global_stats: Indicates inference mode; can only be "True".
*@li mode: An optional attr; reserved and unused.
*@par Outputs:
*@li alpha: A Tensor of type float16 or float32 for the mean calculated on the host CPU.
*@li beta: A Tensor of type float16 or float32 for the variance calculated on the host CPU.
*/
REG_OP(BnHost)
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(momentum, TensorType({DT_FLOAT16,DT_FLOAT}))
.OPTIONAL_INPUT(scale, TensorType({DT_FLOAT16,DT_FLOAT}))
.OPTIONAL_INPUT(offset, TensorType({DT_FLOAT16,DT_FLOAT}))
.ATTR(epsilon, Float, 0.00001)
.ATTR(mode, Int, 1)
.ATTR(use_global_stats, Bool, true)
.OUTPUT(alpha, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(mu, TensorType({DT_FLOAT16,DT_FLOAT}))
.OP_END_FACTORY_REG(BnHost)
/**
*@brief Performs batch normalization . \n



+ 363
- 85
third_party/fwkacllib/inc/ops/nn_calculation_ops.h

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -365,6 +365,25 @@ REG_OP(BiasAddGrad)
* 4-D with shape [batch, out_height, out_width, out_channels]
* or [batch, out_channels, out_height, out_width].
* Gradients with respect to the output of the convolution.
*\n
*\n
* The following are the supported data types and data formats:
*@verbatim
| Tensor | out_backprop | filter | y
------------|-------------|---------|--------
| Data Type | float16 | float16 | float16
| |-------------|---------|--------
| | float32 | float32 | float32
| |-------------|---------|--------
| | float64 | float64 | float64
------------|-------------|---------|--------
| Format | NCHW | NCHW | NCHW
| | NHWC | HWCN | NHWC
@endverbatim
* For float32 and float64 type, the actual calculation on the chip is based on
* float16.
*\n
*
*@par Attributes:
* Five attributes:
* @li strides: A tuple/list of 4 integers. The stride of the sliding window
@@ -377,8 +396,53 @@ REG_OP(BiasAddGrad)
* channels.
* @li data_format: An optional string from: "NHWC", "NCHW". Defaults to
* "NHWC". Specify the data format of the input and output data.
*\n
*\n
* The following value range restrictions must be met:
*@verbatim
| Name | Field | Scope
-------------------|----------|--------------
| input_size | H | [1, 4096]
| | W | [1, 4096]
-------------------|----------|--------------
| Filter | H | [1, 255]
| | W | [1, 255]
-------------------|----------|--------------
| out_backprop | H*strideH| [1, 4096]
| | W*strideW| [1, 4096]
-------------------|----------|--------------
| y(fmap) | H | [1, 4096]
| | W | [1, 4096]
-------------------|----------|--------------
| Stride | H | [1, 63]
| | W | [1, 63]
-------------------|----------|--------------
| Padding | Top | [0, 255]
| | Bottom | [0, 255]
| | Left | [0, 255]
| | Right | [0, 255]
-------------------|----------|--------------
| Dilation | H | [1, 255]
| | W | [1, 255]

@endverbatim
* In Ascend910, the H and W dimensions of fmap or out_backprop do not support the value 1 when
* fmap_h + pad_top + pad_bottom != (filter_height - 1) * dilation_h + 1.
* If filter_h = 1 and filter_w = 1, then out_backprop_w * stride_h * stride_w must be < 4096.
*\n
*
*@par Outputs:
* y: A Tensor. Has the same type as "filter" and the same format as "input_size".
*\n
* out_backprop_height = (fmap_height + pad_top + pad_bottom -
* (dilation_h * (filter_height - 1) + 1))
* / stride_h + 1
*\n
* out_backprop_width = (fmap_width + pad_left + pad_right -
* (dilation_w * (filter_width - 1) + 1))
* / stride_w + 1
*\n
*
*@par Third-party framework compatibility
* Compatible with Tensorflow's conv2d_backprop_input
*/
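
The height/width relation quoted above can be checked with a small helper implementing (in + pad_before + pad_after - (dilation * (k - 1) + 1)) / stride + 1 with integer division; names are illustrative:

#include <cstdio>

// Output extent of a convolution along one spatial dimension.
int ConvOutDim(int in, int pad_before, int pad_after, int k, int dilation,
               int stride) {
    int effective_k = dilation * (k - 1) + 1;
    return (in + pad_before + pad_after - effective_k) / stride + 1;
}

int main() {
    // 224x224 input, 7x7 filter, stride 2, pad 3, dilation 1 -> 112.
    std::printf("%d\n", ConvOutDim(224, 3, 3, 7, 1, 2));
}
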
@@ -454,6 +518,21 @@ REG_OP(Conv2DBackpropInputD)
* @li bias: An optional tensor. Must have the same type as "y".
* @li offset_w: An optional 1D tensor for quantized deconvolution.
* Type is int8. Reserved.\n
*\n
*\n
* The following are the supported data types and data formats:
*@verbatim
| Tensor | x | filter | bias | y
------------|---------|---------|---------|--------
| Data Type | float16 | float16 | float16 | float16
| |---------|---------|---------|--------
| | int8 | int8 | int32 | int32
------------|---------|---------|---------|--------
| Format | NCHW | NCHW | ND | NCHW
@endverbatim
* For int8, a dequant or requant operator must be followed.
*\n
*
*@par Attributes:
* Six attributes:
* @li strides: A tuple or list of 2 integers. The stride of the sliding window
@@ -467,9 +546,54 @@ REG_OP(Conv2DBackpropInputD)
* @li data_format: An optional string from: "NCHW". Defaults to "NCHW". \n
Specify the data format of the input and output data.
* @li offset_x: An optional integer for quantized deconvolution.
* Defaults to "0".
* The negative offset added to the input image for int8 type. Ensure offset_x
* within the effective range of int8 [-128, 127]. Defaults to "0".
*\n
*\n
* The following value range restrictions must be met:
*@verbatim
| Name | Field | Scope
-------------------|----------|--------------
| x (out_backprop) | H*strideH| [1, 4096]
| | W*strideW| [1, 4096]
-------------------|----------|--------------
| Filter | H | [1, 255]
| | W | [1, 255]
-------------------|----------|--------------
| y (fmap) | H | [1, 4096]
| | W | [1, 4096]
-------------------|----------|--------------
| Stride | H | [1, 63]
| | W | [1, 63]
-------------------|----------|--------------
| Padding | Top | [0, 255]
| | Bottom | [0, 255]
| | Left | [0, 255]
| | Right | [0, 255]
-------------------|----------|--------------
| Dilation | H | [1, 255]
| | W | [1, 255]
-------------------|----------|--------------
| Offset_x | | [-128, 127]

@endverbatim
* In Ascend910, the H and W dimensions of fmap or out_backprop do not support the value 1 when
* fmap_h + pad_top + pad_bottom != (filter_height - 1) * dilation_h + 1.
* If filter_h = 1 and filter_w = 1, then out_backprop_w * stride_h * stride_w must be < 4096.
*\n
*
*@par Outputs:
* y: A Tensor. 4D tensor with shape [batch, channels, height, width].
*\n
* out_backprop_height = (fmap_height + pad_top + pad_bottom -
* (dilation_h * (filter_height - 1) + 1))
* / stride_h + 1
*\n
* out_backprop_width = (fmap_width + pad_left + pad_right -
* (dilation_w * (filter_width - 1) + 1))
* / stride_w + 1
*\n
*
* When type of x is float16, the type of y must be float16.
* When type of x is int8, the type of y must be int32.
*/
@@ -502,6 +626,25 @@ REG_OP(Deconvolution)
* [batch, out_height, out_width, out_channels] or [batch, out_channels,
* out_height, out_width]. Gradients with respect to the output of the
* convolution.
*\n
*\n
* The following are the supported data types and data formats:
*@verbatim
| Tensor | x | out_backprop | y
------------|---------|--------------|---------
| Data Type | float16 | float16 | float16
| |---------|--------------|---------
| | float32 | float32 | float32
| |---------|--------------|---------
| | float64 | float64 | float64
|-----------|---------|--------------|---------
| Format | NCHW | NCHW | NCHW
| | NHWC | NHWC | HWCN
@endverbatim
* For float32 and float64 types of x and out_backprop, the actual calculation on the chip
* is based on float16.
*\n
*
*@par Attributes:
* Five attributes:
* @li strides: A tuple/list of 4 integers. The stride of the sliding window
@@ -514,8 +657,52 @@ REG_OP(Deconvolution)
* channels.
* @li data_format: An optional string from: "NHWC", "NCHW". Defaults to
* "NHWC". Specify the data format of the input and output data.
*\n
*\n
* The following value range restrictions must be met:
*@verbatim
| Name | Field | Scope
-------------------|----------|--------------
| x(fmap) | H | [1, 4096]
| | W | [1, 4096]
-------------------|----------|--------------
| Filter Size | H | [1, 255]
| | W | [1, 255]
-------------------|----------|--------------
| out_backprop | H | [1, 4096]
| | W | [1, 4096]
-------------------|----------|--------------
| y | H | [1, 4096]
| | W | [1, 4096]
-------------------|----------|--------------
| Stride | H | [1, 63]
| | W | [1, 63]
-------------------|----------|--------------
| Padding | Top | [0, 255]
| | Bottom | [0, 255]
| | Left | [0, 255]
| | Right | [0, 255]
-------------------|----------|--------------
| Dilation | H | [1, 255]
| | W | [1, 255]

@endverbatim
* On Ascend910, the H and W dimensions of out_backprop do not support the
* value 1 when
* fmap_h + pad_top + pad_bottom != (filter_height - 1) * dilation_h + 1.
*\n
*
*@par Outputs:
* y: A Tensor. Has the same type as x and the same format as filter_size.
*\n
* out_backprop_height = (in_height + pad_top + pad_bottom -
* (dilation_h * (filter_height - 1) + 1))
* / stride_h + 1
*\n
* out_backprop_width = (in_width + pad_left + pad_right -
* (dilation_w * (filter_width - 1) + 1))
* / stride_w + 1
*\n
*
*@par Third-party framework compatibility
* Compatible with Tensorflow's conv2d_backprop_filter
*/
@@ -597,16 +784,14 @@ REG_OP(Conv2DBackpropFilterD)
| Tensor | x | filter | bias | y
------------|---------|---------|---------|--------
| Data Type | float16 | float16 | float16 | float16
| |---------|---------|---------|--------
| | float32 | float32 | float32 | float32
| |---------|---------|---------|--------
| | int8 | int8 | int32 | int32
------------|---------|---------|---------|--------
| Format | NCHW | NCHW | ND | NCHW
| | NHWC | HWCN | | NHWC
@endverbatim
* For float32 type, the actual calculation on the chip is based on
* float16. For int8, a dequant or requant operator must be followed.
* float16.
*\n
*
*@par Attributes:
@@ -617,8 +802,7 @@ REG_OP(Conv2DBackpropFilterD)
* (top, bottom, left, right) side of the input.
*@li dilations: Optional. A list of 4 integers. The dilation factor for each
* dimension of input. The dimension order is determined by the data format of
* "x". The N and C dimensions must be set to 1. The H and W dimensions must be
* set to 1 for int8 type. Defaults to [1, 1, 1, 1].
* "x". The N and C dimensions must be set to 1. Defaults to [1, 1, 1, 1].
*@li groups: Optional. An integer of type int32. The number of blocked
* connections from input channels to output channels. In_channels and
* out_channels must both be divisible by "groups". Defaults to 1.
@@ -652,6 +836,8 @@ REG_OP(Conv2DBackpropFilterD)
| Offset_x | | [-128, 127]

@endverbatim
* The W dimension of the input image may exceed 4096, but this can cause
* compilation errors.
*\n
*
*@par Outputs:
@@ -666,21 +852,6 @@ REG_OP(Conv2DBackpropFilterD)
* out_width = (in_width + pad_left + pad_right -
* (dilation_w * (filter_width - 1) + 1))
* / stride_w + 1
*
*@attention Constraints:
*@li The following restrictions on the output must be met:
*@verbatim
| Output | Restrictions
----------|--------------------------------
| H == 1 | H * W(input) == H * W(filter)
| W == 1 |
----------|--------------------------------
| H != 1 | W(input) == W(filter)
| W == 1 | Only for Ascend310 Hi3796V300CS
@endverbatim
* "H * W (input)" indicates the image size after padding and "H * W (filter)"
* indicates the filter size after dilation."W(input)" and W(filter) indicate
* the same rule on the W dimension.
*\n
*
*@par Quantization supported or not
@@ -778,7 +949,7 @@ REG_OP(Conv2DCompress)
* With the format "HWCN" , the data is stored in the order of: [filter_height,
* filter_width, in_channels / groups, out_channels].
*@li offsets: A 4D tensor of x-y coordinates offset and mask. With the format
* "NHWC", the data is stored in the order of: [batch, in_height, in_width,
* "NHWC", the data is stored in the order of: [batch, out_height, out_width,
* deformable_groups * filter_height * filter_width * 3].
*@li bias: An optional 1D tensor of additive biases to the filter outputs.
* The data is stored in the order of: [out_channels].
@@ -816,31 +987,20 @@ REG_OP(Conv2DCompress)
*@li deformable_groups: Optional. An integer of type int32. The number of
* deformable group partitions. In_channels must be divisible by
* "deformable_groups". Defaults to 1.
*@li modulated: Optional. Specifies the version of DeformableConv2D: true
* means v2, false means v1. Currently only v2 is supported.
*\n
*\n
* The following value range restrictions must be met:
*@verbatim
| Name | Field | Scope
--------------------|--------|----------------------------
| Input Image Size | H | [1, 100000]
| | W | [1, 4096]
--------------------|--------|----------------------------
| Filter Size | H | [1, 255]
| | W | [1, 255]
| Input Image Size | H | [1, 100000 / filter_height]
| | W | [1, 4096 / filter_width]
--------------------|--------|----------------------------
| Stride | H | [1, 63]
| Filter Size | H | [1, 63]
| | W | [1, 63]
--------------------|--------|----------------------------
| Padding | Top | [0, 255]
| | Bottom | [0, 255]
| | Left | [0, 255]
| | Right | [0, 255]
--------------------|--------|----------------------------
| Dilation | H | [1, 255]
| | W | [1, 255]
@endverbatim
* "W(input)" indicate the image width after padding and W(filter) indicates the
* filter width after dilation.
*\n
*
*@par Outputs:
@@ -855,21 +1015,7 @@ REG_OP(Conv2DCompress)
* out_width = (in_width + pad_left + pad_right -
* (dilation_w * (filter_width - 1) + 1))
* / stride_w + 1
*
*@attention Constraints:
*@li The following restrictions on the output must be met:
*@verbatim
| Output | Restrictions
----------|--------------------------------
| H == 1 | H * W(input) == H * W(filter)
| W == 1 |
----------|--------------------------------
| H != 1 | W(input) == W(filter)
| W == 1 | Only for Ascend310 Hi3796V300CS
@endverbatim
* "H * W(input)" indicates the image size after padding and "H * W(filter)"
* indicates the filter size after dilation. "W(input)" and W(filter) indicate
* the same rule on the W dimension.
*\n
*
*@par Quantization supported or not
*@li No
@@ -891,6 +1037,7 @@ REG_OP(DeformableConv2D)
.ATTR(groups, Int, 1)
.ATTR(data_format, String, "NHWC")
.ATTR(deformable_groups, Int, 1)
.ATTR(modulated, Bool, true)
.OP_END_FACTORY_REG(DeformableConv2D)
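A small sketch of the offsets layout documented above; DeformableOffsetsSize
is a hypothetical helper that returns the expected element count of the
offsets input, [batch, out_h, out_w, deformable_groups * kh * kw * 3]:

#include <cstdint>
#include <cstdio>

int64_t DeformableOffsetsSize(int64_t batch, int64_t out_h, int64_t out_w,
                              int64_t deformable_groups, int64_t kh,
                              int64_t kw) {
  // The last dimension packs (x offset, y offset, mask) per sampling location.
  return batch * out_h * out_w * deformable_groups * kh * kw * 3;
}

int main() {
  // batch=2, 28x28 output, 1 deformable group, 3x3 kernel -> 42336 elements.
  std::printf("%lld\n", static_cast<long long>(
                            DeformableOffsetsSize(2, 28, 28, 1, 3, 3)));
  return 0;
}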

/**
@@ -916,12 +1063,12 @@ REG_OP(DeformableConv2D)

*@par Attributes:
* @li groups: Number of blocked connections from input channels to output
* channels. Reserved.
* channels.
* @li data_format: An optional string from: "NDHWC", "NCDHW".
* Defaults to "NDHWC". Specify the data format of the input and output data.
* @li dilations: A list of 5 integers. Specifies the dilation factor for each
* dimension of "x", now only support [1,1,1,1,1]
* The N and C dimensions must be 1. Has the same format as "x".
* dimension of "x".
* The N, C and D dimensions must be 1. Has the same format as "x".
* @li offset_x: An optional int. Input offset, used for quantized inference.
* Defaults to 0. Reserved . \n

@@ -967,8 +1114,8 @@ REG_OP(Conv3D)

*@par Required Attributes:
* @li strides: A list of 5 integers. Specifies the stride of the sliding window
* for each dimension of "x".
* The N and C dimensions must be 1. Has the same format as "x".
* for each dimension of "out_backprop".
* The N and C dimensions must be 1. Has the same format as "out_backprop".
* @li pads: A list of 6 integers.
* Supports only padding along the D, H and W dimensions in sequence of head,
* tail, top, bottom, left and right . \n
@@ -976,14 +1123,15 @@ REG_OP(Conv3D)
*@par Attributes:
* Three attributes:
* @li groups: Number of blocked connections from input channels to output
* channels. Reserved.
* channels.
* @li data_format: An optional string from: "NDHWC", "NCDHW".
* Defaults to "NDHWC". Specify the data format of the input and output data.
* @li dilations: A tuple/list of 5 integers, The dilation factor for each
* dimension of the input, now only support [1,1,1,1,1]
* dimension of the input.
* The N, C and D dimensions must be 1. Has the same format as "out_backprop".

*@par Outputs:
* y: A Tensor. Has the same type as filter,and has same format as input_size
* y: A Tensor. Has the same type as filter, and has the same format as "input_size"

*@par Third-party framework compatibility
* Compatible with Tensorflow's conv3d_backprop_input
@@ -1011,8 +1159,8 @@ REG_OP(Conv3DBackpropInput)

*@par Required Attributes:
* @li strides: A list of 5 integers. Specifies the stride of the sliding window
* for each dimension of "x".
* The N and C dimensions must be 1. Has the same format as "x".
* for each dimension of "out_backprop".
* The N and C dimensions must be 1. Has the same format as "out_backprop".
* @li pads: A list of 6 integers. Supports only padding along the D, H and W
* dimensions in sequence of head, tail, top, bottom, left and right.
* @li input_size: A tuple/list of type int32, int64. An integer vector
@@ -1023,13 +1171,14 @@ REG_OP(Conv3DBackpropInput)
*@par Attributes:
* Three attributes:
* @li groups: Number of blocked connections from input channels to output
* channels. Reserved.
* channels.
* @li data_format: An optional string from: "NDHWC", "NCDHW".
* Defaults to "NDHWC". Specify the data format of the input and output data.
* @li dilations: A tuple/list of 5 integers, The dilation factor for each
* dimension of input, now only support [1,1,1,1,1]
* dimension of input.
* The N, C and D dimensions must be 1. Has the same format as "out_backprop".
*@par Outputs:
* y: A Tensor. Has the same type and data format as out_backprop.
* y: A Tensor. Has the same type and data format as "out_backprop".
*@par Third-party framework compatibility
* Compatible with Tensorflow's conv3d_backprop_input

@@ -1072,9 +1221,7 @@ REG_OP(Conv3DBackpropInputD)
* @li c_t: An optional Tensor of dtype float16 or float32. The cell state at time t . \n

*@par Third-party framework compatibility:
* Compatible with the Pytorch operator adds.
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
* Compatible with the Caffe operator LSTM.
*/
REG_OP(LSTM)
.INPUT(x, TensorType({DT_FLOAT16}))
@@ -1121,14 +1268,15 @@ REG_OP(LSTM)
*@par Attributes:
* Three attributes:
* @li dilations: A tuple/list of 5 integers, The dilation factor for each
* dimension of input, now only support [1,1,1,1,1].
* dimension of input.
* The N, C and D dimensions must be 1. Has the same format as "x".
* @li groups: Number of blocked connections from input channels to output
* channels. Reserved.
* channels.
* @li data_format: An optional string from: "NDHWC", "NCDHW".
* Defaults to "NDHWC". Specify the data format of the input and output data.

*@par Outputs:
* y: A Tensor that has the same type as x
* y: A Tensor that has the same type as "x"
* and the format is NDHWC, NCDHW or DHWCN.
*@par Third-party framework compatibility
* Compatible with Tensorflow's conv3d_backprop_filter
@@ -1172,9 +1320,10 @@ REG_OP(Conv3DBackpropFilter)
*@par Attributes:
* Three attributes:
* @li dilations: A tuple/list of 5 integers, The dilation factor for each
* dimension of input, now only support [1,1,1,1,1].
* dimension of input.
* The N, C and D dimensions must be 1. Has the same format as "x".
* @li groups: Number of blocked connections from input channels to output
* channels. Reserved.
* channels.
* @li data_format: An optional string from: "NDHWC", "NCDHW".
* Defaults to "NDHWC". Specify the data format of the input and output data.

@@ -1224,15 +1373,16 @@ REG_OP(Conv3DBackpropFilterD)
*@par Attributes:
* Five attributes:
* @li groups: Number of blocked connections from input channels to output
* channels. Reserved.
* channels.
* @li dilations: A tuple/list of 5 integers,
* The dilation factor for each dimension of input, now only support [1,1,1,1,1]
* The dilation factor for each dimension of input.
* The N, C and D dimensions must be 1. Has the same format as "x".
* @li data_format: An optional string from: "NDHWC", "NCDHW".
* Defaults to "NDHWC". Specify the data format of the input and output data.
* @li output_padding: The size will be added in the output shape.
* @li offset_x: Input offset_x value. Reserved.
*@par Outputs:
* y: A Tensor. Has the same type and format as x.
* y: A Tensor. Has the same type and format as "x".
*/
REG_OP(Conv3DTranspose)
.INPUT(input_size, TensorType({DT_INT32, DT_INT64}))
@@ -1273,15 +1423,16 @@ REG_OP(Conv3DTranspose)
*@par Attributes:
* Five attributes:
* @li dilations: A tuple/list of 5 integers, The dilation factor for each
* dimension of input, now only support [1,1,1,1,1]
* dimension of input.
* The N, C and D dimensions must be 1. Has the same format as "x".
* @li groups: Number of blocked connections from input channels to output
* channels. Reserved.
* channels.
* @li data_format: An optional string from: "NDHWC", "NCDHW".
* Defaults to "NDHWC". Specify the data format of the input and output data.
* @li output_padding: The size will be added in the output shape.
* @li offset_x: Input offset_x value. Reserved.
*@par Outputs:
* y: A Tensor. Has the same type and format as x.
* y: A Tensor. Has the same type and format as "x".
*@par Restrictions:
* Warning: THIS FUNCTION IS DEPRECATED. Please use Conv3DTranspose instead.
*/
@@ -1316,6 +1467,22 @@ REG_OP(Conv3DTransposeD)
* or [out_channels, in_channel, filter_height, filter_width].
* @li bias: An optional 1D tensor of type float16 or int32. Format is "ND".
* @li offset_w: An optional 1D tensor for quantized inference. Reserved.
*\n
*\n
* The following are the supported data types and data formats:
*@verbatim
| Tensor | x | filter | bias | y
------------|---------|---------|---------|--------
| Data Type | float16 | float16 | float16 | float16
| |---------|---------|---------|--------
| | int8 | int8 | int32 | int32
------------|---------|---------|---------|--------
| Format | NCHW | NCHW | ND | NCHW
| | NHWC | HWCN | | NHWC
@endverbatim
* For int8, the output must be followed by a dequant or requant operator.
*\n
*
*@par Required Attributes:
* @li strides: A required tuple/list of 4 integers. The stride of the sliding
* window for H/W dimension. The index of H/W is same as data_format.
@@ -1333,10 +1500,58 @@ REG_OP(Conv3DTransposeD)
* @li output_padding: The size will be added in the output shape. Defaults
* to [0, 0, 0, 0].
* @li offset_x: An optional int. Input offset, used for quantized inference.
* Defaults to "0".
* The negative offset added to the input image for int8 type. Ensure that
* offset_x is within the effective int8 range [-128, 127]. Defaults to "0".
*\n
*\n
* The following value range restrictions must be met:
*@verbatim
| Name | Field | Scope
-------------------|----------|--------------
| input_size | H | [1, 4096]
| | W | [1, 4096]
-------------------|----------|--------------
| x (out_backprop) | H*strideH| [1, 4096]
| | W*strideW| [1, 4096]
-------------------|----------|--------------
| filter | H | [1, 255]
| | W | [1, 255]
-------------------|----------|--------------
| y (fmap) | H | [1, 4096]
| | W | [1, 4096]
-------------------|----------|--------------
| Stride | H | [1, 63]
| | W | [1, 63]
-------------------|----------|--------------
| Padding | Top | [0, 255]
| | Bottom | [0, 255]
| | Left | [0, 255]
| | Right | [0, 255]
-------------------|----------|--------------
| Dilation | H | [1, 255]
| | W | [1, 255]
-------------------|----------|--------------
| Offset_x | | [-128, 127]

@endverbatim
* On Ascend910, the H and W dimensions of fmap and out_backprop do not
* support the value 1 when
* fmap_h + pad_top + pad_bottom != (filter_height - 1) * dilation_h + 1.
* If filter_h = 1 and filter_w = 1, then
* out_backprop_w * stride_h * stride_w must be less than 4096.
*\n
*
*@par Outputs:
* y: A Tensor. A Tensor of type float16 or int32, and has same format as
* input_size.
*\n
* out_backprop_height = (fmap_height + pad_top + pad_bottom -
* (dilation_h * (filter_height - 1) + 1))
* / stride_h + 1
*\n
* out_backprop_width = (fmap_width + pad_left + pad_right -
* (dilation_w * (filter_width - 1) + 1))
* / stride_w + 1
*\n
*
*/
REG_OP(Conv2DTranspose)
.INPUT(input_size, TensorType({DT_INT32, DT_INT64}))
@@ -1405,21 +1620,22 @@ REG_OP(Conv2DTransposeD)
/**
*@brief Computes the deformed convolution output with the expected input
*@par Inputs:
* Four inputs:
* Two inputs:
* @li x: A Tensor of type float16 or float32.
* @li offsets: A Tensor of type float16 or float32. Deformation offset parameter.
*@par Required Attributes:
* @li strides: A tuple/list of 4 integers. The stride of the sliding window
* for the H and W dimensions.
* @li pads: A tuple/list of 4 integers.Padding added to each dimension
* @li pads: A tuple/list of 4 integers. Padding added to the H/W dimensions
* of the input.
* @li ksize: A tuple/list of 2 integers. Kernel size.
*@par Attributes:
* Three attributes:
* Four attributes:
* @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension
* of input. Defaults to [1, 1, 1, 1]
* @li data_format: An optional string from: "NCHW", "NHWC". Defaults to "NCHW". Specify the data format of the input x.
* @li deformable_groups: Specifies the C-axis grouping number of input x.
* @li modulated: Specifies the version of DeformableConv2D: true means v2, false means v1.
*@par Outputs:
* y: A Tensor. A Tensor of type float16, float32.
*/
@@ -1433,7 +1649,69 @@ REG_OP(DeformableOffsets)
.ATTR(dilations, ListInt, {1, 1, 1, 1})
.ATTR(data_format, String, "NCHW")
.ATTR(deformable_groups, Int, 1)
.ATTR(modulated, Bool, true)
.OP_END_FACTORY_REG(DeformableOffsets)

/**
*@brief Computes the gradients of DeformableOffsets with respect to input and offsets
*@par Inputs:
* Three inputs:
* @li grad: A Tensor of type float16 or float32. Gradients with respect to the DeformableOffsets output.
* @li x: A Tensor of type float16 or float32.
* @li offsets: A Tensor of type float16 or float32. Deformation offset parameter.
*@par Required Attributes:
* @li strides: A tuple/list of 4 integers. The stride of the sliding window
* for the H and W dimensions.
* @li pads: A tuple/list of 4 integers. Padding added to the H/W dimensions
* of the input.
* @li ksize: A tuple/list of 2 integers. Kernel size.
*@par Attributes:
* Three attributes:
* @li dilations: A tuple/list of 4 integers, The dilation factor for each dimension
* of input. Defaults to [1, 1, 1, 1]
* @li data_format: An optional string from: "NCHW", "NHWC". Defaults to "NCHW". Specify the data format of the input x.
* @li deformable_groups: Specifies the C-axis grouping number of input x.
* @li modulated: Specifies the version of DeformableConv2D: true means v2, false means v1.
*@par Outputs:
* grad_x: A Tensor of type float16, float32. Gradients with respect to input_x
* grad_offsets: A Tensor of type float16, float32. Gradients with respect to input_offsets
*/
REG_OP(DeformableOffsetsGrad)
.INPUT(grad, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(offsets, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(grad_x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(grad_offsets, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(pads, ListInt)
.REQUIRED_ATTR(ksize, ListInt)
.ATTR(dilations, ListInt, {1, 1, 1, 1})
.ATTR(data_format, String, "NCHW")
.ATTR(deformable_groups, Int, 1)
.ATTR(modulated, Bool, true)
.OP_END_FACTORY_REG(DeformableOffsetsGrad)

/**
*@brief Computes the deformed dilation output with the expected input
*@par Inputs:
* One input:
* @li x: A Tensor of type int8, float16, float32
*@par Required Attributes:
* @li dilations: A tuple/list of integers.
*@par Attributes:
* Two attributes:
* @li padding_value: The value used to fill the blank positions. Defaults to 0.0.
* @li pads: A tuple/list of integers.
*@par Outputs:
* y: A Tensor. A Tensor of type int8, float16, float32.
*/
REG_OP(Dilation)
.INPUT(x, TensorType({DT_INT8, DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_INT8, DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(dilations, ListInt)
.ATTR(pads, ListInt, {})
.ATTR(padding_value, Float, 0.0)
.OP_END_FACTORY_REG(Dilation)
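A hedged 1-D sketch of the behavior described above, assuming Dilation inserts
(dilation - 1) positions filled with padding_value between neighboring
elements; the shape rule (n - 1) * d + 1 is an assumption, not taken from this
header:

#include <cstdio>
#include <vector>

std::vector<float> Dilate1D(const std::vector<float> &x, int d,
                            float padding_value) {
  if (x.empty() || d < 1) return x;
  // Output length (n - 1) * d + 1; gaps are pre-filled with padding_value.
  std::vector<float> y((x.size() - 1) * d + 1, padding_value);
  for (size_t i = 0; i < x.size(); ++i) y[i * d] = x[i];
  return y;
}

int main() {
  for (float v : Dilate1D({1.f, 2.f, 3.f}, 2, 0.f)) std::printf("%g ", v);
  // prints: 1 0 2 0 3
  return 0;
}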

} // namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_CALCULATION_OPS_H_

+ 462
- 140
third_party/fwkacllib/inc/ops/nn_detect_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -254,22 +254,22 @@ is min_size/sqrt(aspect_ratio), the width is min_size*sqrt(aspect_ratio). Defaul
*@par Third-party framework compatibility
* It is a custom operator. It has no corresponding operator in Caffe.
*/
REG_OP(PriorBox)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(min_size, ListFloat)
.REQUIRED_ATTR(max_size, ListFloat)
.REQUIRED_ATTR(aspect_ratio, ListFloat)
.ATTR(img_h, Int, 0)
.ATTR(img_w, Int, 0)
.ATTR(step_h, Float, 0.0)
.ATTR(step_w, Float, 0.0)
.ATTR(flip, Bool, true)
.ATTR(clip, Bool, false)
.ATTR(offset, Float, 0.5)
.ATTR(variance, ListFloat, {0.1})
.OP_END_FACTORY_REG(PriorBox);
REG_OP(PriorBox)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(min_size, ListFloat)
.REQUIRED_ATTR(max_size, ListFloat)
.REQUIRED_ATTR(aspect_ratio, ListFloat)
.ATTR(img_h, Int, 0)
.ATTR(img_w, Int, 0)
.ATTR(step_h, Float, 0.0)
.ATTR(step_w, Float, 0.0)
.ATTR(flip, Bool, true)
.ATTR(clip, Bool, false)
.ATTR(offset, Float, 0.5)
.ATTR(variance, ListFloat, {0.1})
.OP_END_FACTORY_REG(PriorBox);

/**
*@brief Performs SSD prior box detection, with four additional matrices and the "aspect_ratio" attribute deleted compared to PriorBox . \n
@@ -306,25 +306,25 @@ is min_size/sqrt(aspect_ratio), the width is min_size*sqrt(aspect_ratio). Defaul
*@par Restrictions:
*Warning: THIS FUNCTION IS DEPRECATED. Please use PriorBox instead.
*/
REG_OP(PriorBoxD)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(data_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(data_w, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(box_height, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(box_width, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(min_size, ListFloat)
.REQUIRED_ATTR(max_size, ListFloat)
.ATTR(img_h, Int, 0)
.ATTR(img_w, Int, 0)
.ATTR(step_h, Float, 0.0)
.ATTR(step_w, Float, 0.0)
.ATTR(flip, Bool, true)
.ATTR(clip, Bool, false)
.ATTR(offset, Float, 0.5)
.ATTR(variance, ListFloat, {0.1})
.OP_END_FACTORY_REG(PriorBoxD);
REG_OP(PriorBoxD)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(data_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(data_w, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(box_height, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(box_width, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(min_size, ListFloat)
.REQUIRED_ATTR(max_size, ListFloat)
.ATTR(img_h, Int, 0)
.ATTR(img_w, Int, 0)
.ATTR(step_h, Float, 0.0)
.ATTR(step_w, Float, 0.0)
.ATTR(flip, Bool, true)
.ATTR(clip, Bool, false)
.ATTR(offset, Float, 0.5)
.ATTR(variance, ListFloat, {0.1})
.OP_END_FACTORY_REG(PriorBoxD);

/**
*@brief Performs SSD prior box detection, with four additional matrices and the "aspect_ratio" attribute deleted compared to PriorBox . \n
@@ -358,22 +358,22 @@ is min_size/sqrt(aspect_ratio), the width is min_size*sqrt(aspect_ratio). Defaul
*@par Restrictions:
*Warning: THIS FUNCTION IS DEPRECATED. Please use PriorBox instead.
*/
REG_OP(PriorBoxDV2)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(min_size, ListFloat)
.REQUIRED_ATTR(max_size, ListFloat)
.ATTR(img_h, Int, 0)
.ATTR(img_w, Int, 0)
.ATTR(step_h, Float, 0.0)
.ATTR(step_w, Float, 0.0)
.ATTR(flip, Bool, true)
.ATTR(clip, Bool, false)
.ATTR(offset, Float, 0.5)
.ATTR(variance, ListFloat, {0.1})
.OP_END_FACTORY_REG(PriorBoxDV2);
REG_OP(PriorBoxDV2)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(min_size, ListFloat)
.REQUIRED_ATTR(max_size, ListFloat)
.ATTR(img_h, Int, 0)
.ATTR(img_w, Int, 0)
.ATTR(step_h, Float, 0.0)
.ATTR(step_w, Float, 0.0)
.ATTR(flip, Bool, true)
.ATTR(clip, Bool, false)
.ATTR(offset, Float, 0.5)
.ATTR(variance, ListFloat, {0.1})
.OP_END_FACTORY_REG(PriorBoxDV2);

/**
*@brief Performs Position Sensitive ROI Pooling . \n
@@ -531,10 +531,10 @@ as xx...xyy...yww...whh...hbb...bc0c0..c0c1c1...c1......cncn...cn . \n
* It is a custom operator. It has no corresponding operator in Caffe.
*/
REG_OP(Yolo)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(coord_data, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(obj_prob, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(classes_prob, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(boxes, Int, 3)
.ATTR(coords, Int, 4)
.ATTR(classes, Int, 80)
@@ -584,10 +584,10 @@ REG_OP(Yolo)
* It is a custom operator. It has no corresponding operator in Caffe.
*/
REG_OP(YoloV2DetectionOutput)
.INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(coord_data, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(obj_prob, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(classes_prob, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img_info, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(biases, ListFloat)
.ATTR(boxes, Int, 5)
.ATTR(coords, Int, 4)
@@ -598,7 +598,7 @@ REG_OP(YoloV2DetectionOutput)
.ATTR(score_threshold, Float, 0.5)
.ATTR(iou_threshold, Float, 0.45)
.ATTR(pre_nms_topn, Int, 512)
.OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(box_out, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(box_out_num, TensorType({DT_INT32}))
.OP_END_FACTORY_REG(YoloV2DetectionOutput)

@@ -647,12 +647,12 @@ REG_OP(YoloV2DetectionOutput)
*Warning: THIS FUNCTION IS DEPRECATED. Please use YoloV2DetectionOutput instead.
*/
REG_OP(YoloV2DetectionOutputD)
.INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(windex, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(hindex, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(coord_data, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(obj_prob, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(classes_prob, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img_info, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(windex, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(hindex, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(biases, ListFloat)
.ATTR(boxes, Int, 5)
.ATTR(coords, Int, 4)
@@ -663,7 +663,7 @@ REG_OP(YoloV2DetectionOutputD)
.ATTR(score_threshold, Float, 0.5)
.ATTR(iou_threshold, Float, 0.45)
.ATTR(pre_nms_topn, Int, 512)
.OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(box_out, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(box_out_num, TensorType({DT_INT32}))
.OP_END_FACTORY_REG(YoloV2DetectionOutputD)

@@ -707,16 +707,16 @@ REG_OP(YoloV2DetectionOutputD)
* It is a custom operator. It has no corresponding operator in Caffe.
*/
REG_OP(YoloV3DetectionOutput)
.INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(coord_data_low, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(coord_data_mid, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(coord_data_high, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(obj_prob_low, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(obj_prob_mid, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(obj_prob_high, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(classes_prob_low, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(classes_prob_mid, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(classes_prob_high, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img_info, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(biases_low, ListFloat)
.REQUIRED_ATTR(biases_mid, ListFloat)
.REQUIRED_ATTR(biases_high, ListFloat)
@@ -729,7 +729,7 @@ REG_OP(YoloV3DetectionOutput)
.ATTR(score_threshold, Float, 0.5)
.ATTR(iou_threshold, Float, 0.45)
.ATTR(pre_nms_topn, Int, 512)
.OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(box_out, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(box_out_num, TensorType({DT_INT32}))
.OP_END_FACTORY_REG(YoloV3DetectionOutput)

@@ -776,22 +776,22 @@ s
*Warning: THIS FUNCTION IS DEPRECATED. Please use YoloV3DetectionOutput instead.
*/
REG_OP(YoloV3DetectionOutputD)
.INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(windex1, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(windex2, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(windex3, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(hindex1, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(hindex2, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(hindex3, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(coord_data_low, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(coord_data_mid, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(coord_data_high, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(obj_prob_low, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(obj_prob_mid, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(obj_prob_high, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(classes_prob_low, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(classes_prob_mid, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(classes_prob_high, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(img_info, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(windex1, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(windex2, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(windex3, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(hindex1, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(hindex2, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(hindex3, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(biases_low, ListFloat)
.REQUIRED_ATTR(biases_mid, ListFloat)
.REQUIRED_ATTR(biases_high, ListFloat)
@@ -804,7 +804,7 @@ REG_OP(YoloV3DetectionOutputD)
.ATTR(score_threshold, Float, 0.5)
.ATTR(iou_threshold, Float, 0.45)
.ATTR(pre_nms_topn, Int, 512)
.OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(box_out, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(box_out_num, TensorType({DT_INT32}))
.OP_END_FACTORY_REG(YoloV3DetectionOutputD)

@@ -848,7 +848,7 @@ There are three Yolo operators at Yolov3DetectionOutput's preceding layer on Yol
* It is a custom operator. It has no corresponding operator in Caffe.
*/
REG_OP(YoloV3DetectionOutputV2)
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(biases, ListFloat)
.ATTR(boxes, Int, 3)
.ATTR(coords, Int, 4)
@@ -862,7 +862,7 @@ REG_OP(YoloV3DetectionOutputV2)
.ATTR(N, Int, 10)
.ATTR(resize_origin_img_to_net, Bool, false)
.ATTR(out_box_dim, Int, 3)
.OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(box_out, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(box_out_num, TensorType({DT_INT32}))
.OP_END_FACTORY_REG(YoloV3DetectionOutputV2)

@@ -910,9 +910,9 @@ REG_OP(YoloV3DetectionOutputV2)
* Warning: THIS FUNCTION IS DEPRECATED. Please use YoloV3DetectionOutputV2 instead.
*/
REG_OP(YoloV3DetectionOutputV2D)
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.DYNAMIC_INPUT(windex, TensorType({DT_FLOAT16,DT_FLOAT}))
.DYNAMIC_INPUT(hindex, TensorType({DT_FLOAT16,DT_FLOAT}))
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.DYNAMIC_INPUT(windex, TensorType({DT_FLOAT16, DT_FLOAT}))
.DYNAMIC_INPUT(hindex, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(biases, ListFloat)
.ATTR(boxes, Int, 3)
.ATTR(coords, Int, 4)
@@ -926,7 +926,7 @@ REG_OP(YoloV3DetectionOutputV2D)
.ATTR(N, Int, 10)
.ATTR(resize_origin_img_to_net, Bool, false)
.ATTR(out_box_dim, Int, 3)
.OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(box_out, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(box_out_num, TensorType({DT_INT32}))
.OP_END_FACTORY_REG(YoloV3DetectionOutputV2D)

@@ -968,8 +968,9 @@ REG_OP(SPP)
* Three inputs, including:
*@li x: An NC1HWC0 tensor of type float16 or float32, describing the feature
* map.
*@li rois: A tensor of type float16 or float32, with shape
*@li rois: A tensor of type float16 or float32, with 3D shape
* [batch, 5, roi_max_num], describing the ROIs.
* roi_max_num must be less than or equal to 6000 and must be divisible by 16.
*@li roi_actual_num: A optional tensor of type int32, with shape [batch, 8], specifying
* the number of ROIs per batch . \n

@@ -1201,35 +1202,6 @@ REG_OP(RpnProposalsD)
.OUTPUT(sorted_box, TensorType({DT_FLOAT16}))
.OP_END_FACTORY_REG(RpnProposalsD)

/**
*@brief Computes the Score Filter Pre-Sort function.

*@par Inputs:
*Inputs include:
* @li rois: A Tensor. Must be float16. N-D with shape [N, 4].
* @li cls_bg_prob: A Tensor. Must be float16. N-D with shape [N, 1].

*@par Attributes:
* @li score_threshold: required, float, threshold of the topk process.
* @li k: required, Int, threshold of the topk process.
* @li score_filter: bool, mark of score_filter. Defaults to "true"
* @li core_max_num: int, max number of core. Defaults to "8"
*@par Outputs:
* @li sorted_proposal: A Tensor. Must be float16.
* N-D with shape [8*6002, 8].
* @li proposal_num: A Tensor. Must be uint32. N-D with shape [8, 8].
*/

REG_OP(ScoreFiltePreSort)
.INPUT(rois, TensorType({DT_FLOAT16}))
.INPUT(cls_bg_prob, TensorType({DT_FLOAT16}))
.OUTPUT(sorted_proposal, TensorType({ DT_FLOAT16}))
.OUTPUT(proposal_num, TensorType({ DT_UINT32}))
.REQUIRED_ATTR(score_threshold, Float)
.REQUIRED_ATTR(k, Int)
.ATTR(score_filter, Bool, true)
.ATTR(core_max_num, Int, 8)
.OP_END_FACTORY_REG(ScoreFiltePreSort)

/**
*@brief Computes the Score Filter Pre-Sort function.
@@ -1383,6 +1355,7 @@ REG_OP(DecodeWheelsTarget)

*@attention Constraints:
* Only computation of float16 data is supported.
* Note: when class num per image * max_size_per_class is too large, compilation
* will fail with an insufficient-memory error.
*/
REG_OP(BatchMultiClassNonMaxSuppression)
.INPUT(boxes, TensorType({DT_FLOAT16}))
@@ -1464,9 +1437,9 @@ REG_OP(NormalizeBBox)
* y: A Tensor. Must have the same type as box_predictions.
*/
REG_OP(DecodeBboxV2)
.INPUT(boxes, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(anchors, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(anchors, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(scales, ListFloat, {1.0, 1.0, 1.0, 1.0})
.ATTR(decode_clip, Float, 0.0)
.ATTR(reversed_box, Bool, false)
@@ -1477,7 +1450,8 @@ REG_OP(DecodeBboxV2)
*
*@par Inputs:
*Inputs include:
* x: A Tensor. Must be float16 or float32.
* x: A Tensor. Dtype support: float16, float, int16, int8,
uint8, int32, int64.
*
*@par Attributes:
* @li axis: optional, int.
@@ -1485,16 +1459,364 @@ REG_OP(DecodeBboxV2)
*
*@par Outputs:
* @li y1: A Tensor. Must have the same type as x.
* @li y2: A Tensor. Indices of y1 in x.Dtype must be int32.
* @li y2: A Tensor. Indices of y1 in x. Dtype must be int32.
*
*/
REG_OP(Sort)
.INPUT(x, TensorType({ DT_FLOAT16 }))
.OUTPUT(y1, TensorType({ DT_FLOAT16 }))
.OUTPUT(y2, TensorType({ DT_INT32 }))
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT16, DT_INT8,
DT_UINT8, DT_INT32, DT_INT64}))
.OUTPUT(y1, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT16, DT_INT8,
DT_UINT8, DT_INT32, DT_INT64}))
.OUTPUT(y2, TensorType({DT_INT32}))
.ATTR(axis, Int, -1)
.ATTR(descending, Bool, false)
.OP_END_FACTORY_REG(Sort)
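An illustrative host-side equivalent of the Sort contract above: y1 holds the
sorted values and y2 holds the int32 indices of y1's elements in x
(SortWithIndices is a hypothetical name, not part of this header):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

void SortWithIndices(const std::vector<float> &x, bool descending,
                     std::vector<float> *y1, std::vector<int32_t> *y2) {
  y2->resize(x.size());
  std::iota(y2->begin(), y2->end(), 0);  // 0, 1, ..., n-1
  // Stable sort of the indices by the values they point at.
  std::stable_sort(y2->begin(), y2->end(), [&](int32_t a, int32_t b) {
    return descending ? x[a] > x[b] : x[a] < x[b];
  });
  y1->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) (*y1)[i] = x[(*y2)[i]];
}

int main() {
  std::vector<float> y1;
  std::vector<int32_t> y2;
  SortWithIndices({0.3f, 0.9f, 0.1f}, /*descending=*/true, &y1, &y2);
  for (size_t i = 0; i < y1.size(); ++i)
    std::printf("%g(%d) ", y1[i], y2[i]);  // 0.9(1) 0.3(0) 0.1(2)
  return 0;
}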

/**
*@brief Computes iou for input bboxes and gtboxes.

*@par Inputs:
* Two inputs, including:
*@li bboxes: boxes, a 4D Tensor of type float16 with the shape (x0, x1, y0, y1),
*@li gtboxes: boxes, a 4D Tensor of type float16 with the shape (x0, x1, y0, y1).\n

*@par Attributes:
*@li mode: An optional attribute of type string, specifying the IoU computation mode. \n

*@par Outputs:
*@li overlap: A 2D Tensor of type float16 with shape [n, m]. \n

*@attention Constraints:
* Only computation of float16 data is supported.

*@par Restrictions:
*Warning: THIS FUNCTION IS DEPRECATED. Please use Iou instead.
*/
REG_OP(PtIou)
.INPUT(bboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(gtboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(overlap, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(mode, String, "iou")
.OP_END_FACTORY_REG(PtIou)

/**
*@brief Greedily selects a subset of bounding boxes in descending order of
score . \n

*@par Inputs:
*Input boxes and scores must be float16 type. Inputs include:
*@li boxes: An input tensor with shape [num_batches, spatial_dimension, 4].
The single box data format is indicated by center_point_box.
*@li scores: An input tensor with shape [num_batches, num_classes, spatial_dimension]
*@li max_output_size: A scalar integer tensor representing the maximum number
of boxes to be selected by non max suppression.
*@li iou_threshold: A 0-D float tensor representing the threshold for deciding
whether boxes overlap too much with respect to IOU.
*@li score_threshold: A 0-D float tensor representing the threshold for
deciding when to remove boxes based on score . \n

*@par Attributes:
*center_point_box: An integer indicating the format of the box data.
The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2],
where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair
of box corners, and the coordinates can be provided as normalized
(i.e., lying in the interval [0, 1]) or absolute. Mostly used for TF models.
1 - the box data is supplied as [x_center, y_center, width, height].
Mostly used for PyTorch models. \n

*@par Outputs:
*@li selected_indices: A 2-D integer tensor of shape [M] representing the
selected indices from the boxes tensor, where M <= max_output_size. \n

*@attention Constraints:
*Input boxes and scores must be float16 type . \n

*@par Third-party framework compatibility
*Compatible with onnx NonMaxSuppression operator.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/

REG_OP(NonMaxSuppressionV6)
.INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(scores, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(max_output_size, TensorType({DT_INT32}))
.OPTIONAL_INPUT(iou_threshold, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(score_threshold, TensorType({DT_FLOAT}))
.OUTPUT(selected_indices, TensorType({DT_INT32}))
.ATTR(center_point_box, Int, 0)
.ATTR(max_boxes_size, Int, 0)
.OP_END_FACTORY_REG(NonMaxSuppressionV6)
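A minimal greedy NMS sketch matching the description above, for boxes given as
[y1, x1, y2, x2] (center_point_box == 0); this is a host-side illustration
under assumed semantics, not the device implementation:

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

struct Box { float y1, x1, y2, x2; };

static float Iou(const Box &a, const Box &b) {
  float iy1 = std::max(a.y1, b.y1), ix1 = std::max(a.x1, b.x1);
  float iy2 = std::min(a.y2, b.y2), ix2 = std::min(a.x2, b.x2);
  float inter = std::max(0.f, iy2 - iy1) * std::max(0.f, ix2 - ix1);
  float area_a = (a.y2 - a.y1) * (a.x2 - a.x1);
  float area_b = (b.y2 - b.y1) * (b.x2 - b.x1);
  return inter <= 0.f ? 0.f : inter / (area_a + area_b - inter);
}

std::vector<int> GreedyNms(const std::vector<Box> &boxes,
                           const std::vector<float> &scores, float iou_thr,
                           float score_thr, int max_out) {
  std::vector<int> order(boxes.size());
  std::iota(order.begin(), order.end(), 0);
  // Visit boxes in descending score order.
  std::sort(order.begin(), order.end(),
            [&](int a, int b) { return scores[a] > scores[b]; });
  std::vector<int> keep;
  for (int i : order) {
    if (scores[i] < score_thr || (int)keep.size() >= max_out) break;
    bool suppressed = false;
    for (int k : keep)
      if (Iou(boxes[i], boxes[k]) > iou_thr) { suppressed = true; break; }
    if (!suppressed) keep.push_back(i);
  }
  return keep;
}

int main() {
  std::vector<Box> b{{0, 0, 10, 10}, {1, 1, 11, 11}, {20, 20, 30, 30}};
  for (int i : GreedyNms(b, {0.9f, 0.8f, 0.7f}, 0.5f, 0.1f, 10))
    std::printf("%d ", i);  // expected: 0 2
  return 0;
}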

/**
*@brief Greedily selects a subset of bounding boxes in descending order of
score . \n

*@par Inputs:
*Input boxes and scores must be float16 type. Inputs include:
*@li boxes: An input tensor with shape [num_batches, spatial_dimension, 4].
The single box data format is indicated by center_point_box.
*@li scores: An input tensor with shape [num_batches, num_classes, spatial_dimension]
*@li max_output_size: A scalar integer tensor representing the maximum number
of boxes to be selected by non max suppression.
*@li iou_threshold: A 0-D float tensor representing the threshold for deciding
whether boxes overlap too much with respect to IOU.
*@li score_threshold: A 0-D float tensor representing the threshold for
deciding when to remove boxes based on score . \n
*@li index_id: An input tensor with shape [num_batches, num_classes, spatial_dimension, 3],
the last dim representing (batch_id, class_id, index_id) . \n

*@par Attributes:
*center_point_box: An integer indicating the format of the box data.
The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2],
where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair
of box corners, and the coordinates can be provided as normalized
(i.e., lying in the interval [0, 1]) or absolute. Mostly used for TF models.
1 - the box data is supplied as [x_center, y_center, width, height].
Mostly used for PyTorch models. \n

*@par Outputs:
*@li selected_indices: A 2-D integer tensor of shape [M] representing the
selected indices from the boxes tensor, where M <= max_output_size. \n

*@attention Constraints:
*Input boxes and scores must be float16 type . \n

*@par Third-party framework compatibility
*Compatible with onnx NonMaxSuppression operator.
*/

REG_OP(NonMaxSuppressionV7)
.INPUT(boxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(scores, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(max_output_size, TensorType({DT_INT32}))
.OPTIONAL_INPUT(iou_threshold, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(score_threshold, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(index_id, TensorType({DT_FLOAT16}))
.OUTPUT(selected_indices, TensorType({DT_INT32}))
.ATTR(center_point_box, Int, 0)
.ATTR(max_boxes_size, Int, 0)
.OP_END_FACTORY_REG(NonMaxSuppressionV7)

/**
*@brief Obtains the ROI feature matrix from the feature map list. It is a customized fused operator for mmdetection. \n

*@par Inputs:
* Three inputs, including:
*@li features: A 5HD Tensor list of type float32 or float16.
*@li rois: ROI position. A 2D Tensor of float32 or float16 with shape (N, 5). "N" indicates the number of ROIs,
* the value "5" indicates the indexes of images where the ROIs are located, "x0", "y0", "x1", and "y1".

*@par Attributes:
*@li finest_scale: An optional attribute of type int, specifying the scale used to calculate the level of each "rois".
*@li roi_scale_factor: An optional attribute of type float32, specifying the rescaling of "rois" coordinates.
*@li spatial_scale: An optional attribute of type list float32, specifying the scaling ratio of "features"
* to the original image.
*@li pooled_height: An optional attribute of type int32, specifying the H dimension.
*@li pooled_width: An optional attribute of type int32, specifying the W dimension.
*@li sample_num: An optional attribute of type int32, specifying the horizontal and vertical sampling frequency
* of each output. If this attribute is set to "0", the sampling frequency is equal to the rounded up value of "rois",
* which is a floating point number. Defaults to "0".
*@li pool_mode: An optional attribute of type string to indicate pooling mode. Defaults to "avg" . \n
*@li aligned: An optional attribute of type bool, specifying the align to corner. Defaults to true . \n

*@par Outputs:
* output: Outputs the feature sample of each ROI position. The format is 5HD Tensor of type float32 or float16.
* The axis N is the number of input ROIs. Axes H, W, and C are consistent with the values of "pooled_height",
* "pooled_width", and "features", respectively.

*@par Third-party framework compatibility
*Compatible with mmdetection SingleRoIExtractor operator.
*/
REG_OP(RoiExtractor)
.DYNAMIC_INPUT(features, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(finest_scale, Int, 56)
.ATTR(roi_scale_factor, Float, 0)
.ATTR(spatial_scale, ListFloat, {1.f / 4, 1.f / 8, 1.f / 16, 1.f / 32})
.ATTR(pooled_height, Int, 7)
.ATTR(pooled_width, Int, 7)
.ATTR(sample_num, Int, 0)
.ATTR(pool_mode, String, "avg")
.ATTR(aligned, Bool, true)
.OP_END_FACTORY_REG(RoiExtractor)
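A hedged sketch of how finest_scale is commonly used in mmdetection's
SingleRoIExtractor to pick a feature level per ROI; the exact rule used on the
device is an assumption, not taken from this header:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Maps an ROI to a pyramid level via floor(log2(sqrt(area) / finest_scale)),
// clamped to [0, num_levels - 1]. MapRoiToLevel is a hypothetical name.
int MapRoiToLevel(float x0, float y0, float x1, float y1, int finest_scale,
                  int num_levels) {
  float scale = std::sqrt(std::max(0.f, (x1 - x0) * (y1 - y0)));
  int lvl = (int)std::floor(std::log2(scale / finest_scale + 1e-6f));
  return std::min(std::max(lvl, 0), num_levels - 1);
}

int main() {
  // A 112x112 ROI with finest_scale=56 lands on level 1.
  std::printf("%d\n", MapRoiToLevel(0, 0, 112, 112, 56, 4));
  return 0;
}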

/**
*@brief Performs Position Sensitive PS ROI Pooling . \n

*@par Inputs:
* Two inputs, including:
*@li x: An NC1HWC0 tensor of type float16 or float32, describing the feature
* map. Dimension C1 must be equal to
* int((output_dim + 15) / C0) * group_size * group_size.
*@li rois: A tensor of type float16 or float32, with shape
* [batch, 5, rois_num], describing the ROIs, each ROI consists of five
* elements: "batch_id", "x1", "y1", "x2", and "y2", which "batch_id" indicates
* the index of the input feature map, "x1", "y1", "x2", or "y2" must be
* greater than or equal to "0.0" . \n

*@par Attributes:
*@li output_dim: A required int32, specifying the number of output channels,
* must be greater than 0.
*@li group_size: A required int32, specifying the number of groups to encode
* position-sensitive score maps, must be within the range (0, 128).
*@li spatial_scale: A required float32, scaling factor for mapping the input
* coordinates to the ROI coordinates . \n

*@par Outputs:
*y: An NC1HWC0 tensor of type float16 or float32, describing the result
* feature map . \n

*@attention Constraints:
* NC1HWC0: the channel must be group_size squared, and rois_num must be a multiple of 16
*/
REG_OP(PSROIPoolingV2)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(spatial_scale, Float)
.REQUIRED_ATTR(output_dim, Int)
.REQUIRED_ATTR(group_size, Int)
.OP_END_FACTORY_REG(PSROIPoolingV2)
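An illustrative check of the C1 constraint quoted above; CheckPsroiC1 is a
hypothetical helper, and C0 = 16 is an assumption for the common NC1HWC0
layout:

#include <cstdio>

// Expects C1 == ceil(output_dim / C0) * group_size * group_size.
bool CheckPsroiC1(int c1, int output_dim, int group_size, int c0 = 16) {
  int expected = ((output_dim + c0 - 1) / c0) * group_size * group_size;
  return c1 == expected;
}

int main() {
  // output_dim=21, group_size=7 -> ceil(21/16)=2, expected C1 = 2*49 = 98.
  std::printf("%s\n", CheckPsroiC1(98, 21, 7) ? "ok" : "mismatch");
  return 0;
}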

/**
*@brief Performs Position Sensitive PS ROI Pooling Grad . \n

*@par Inputs:
* Two inputs, including:
*@li x: An NC1HWC0 tensor of type float16 or float32, describing the result
* feature map . \n
*@li rois: A tensor of type float16 or float32, with shape
* [batch, 5, rois_num], describing the ROIs, each ROI consists of five
* elements: "batch_id", "x1", "y1", "x2", and "y2", which "batch_id" indicates
* the index of the input feature map, "x1", "y1", "x2", or "y2" must be
* greater than or equal to "0.0" . \n

*@par Attributes:
*@li output_dim: A required int32, specifying the number of output channels,
* must be greater than 0.
*@li group_size: A required int32, specifying the number of groups to encode
* position-sensitive score maps, must be within the range (0, 128).
*@li spatial_scale: A required float32, scaling factor for mapping the input
* coordinates to the ROI coordinates . \n
*@li input_size: A required listInt, mapping the gradinput size: (H, W)

*@par Outputs:
*y: An NC1HWC0 tensor of type float16 or float32, describing the feature
* map. Dimension C1 must be equal to
* int((output_dim + 15) / C0) * group_size * group_size.

*@attention Constraints:
* NC1HWC0: the channel must be group_size squared, and rois_num must be a multiple of 16
*/
REG_OP(PSROIPoolingGradV2D)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(rois, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(spatial_scale, Float)
.REQUIRED_ATTR(output_dim, Int)
.REQUIRED_ATTR(group_size, Int)
.REQUIRED_ATTR(input_size, ListInt)
.OP_END_FACTORY_REG(PSROIPoolingGradV2D)

/**
*@brief Generate the responsible flags of anchor in a single feature map.

*@par Inputs:
*@li gt_bboxes: Ground truth box, 2-D Tensor with shape `[batch, 4]`.

*@par Attributes:
*@li featmap_size: The size of feature maps, listint.
*@li strides: Stride of current level, listint.
*@li num_base_anchors: The number of base anchors.

*@par Outputs:
*flags: The valid flags of each anchor in a single level.
*/
REG_OP(AnchorResponseFlags)
.INPUT(gt_bboxes, TensorType({DT_FLOAT}))
.OUTPUT(flags, TensorType({DT_UINT8}))
.REQUIRED_ATTR(featmap_size, ListInt)
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(num_base_anchors, Int)
.OP_END_FACTORY_REG(AnchorResponseFlags)

/**
*@brief Generates bounding boxes based on yolo's "anchor" and "ground-truth" boxes.
* It is a customized mmdetection operator . \n

*@par Inputs:
* Three inputs, including:
*@li anchor_boxes: anchor boxes generated by the yolo training set.
* A 2D Tensor of type float32 or float16 with shape (N, 4). "N" indicates the number
* of ROIs, "N" indicates the number of ROIs, and the value "4" refers to (tx, ty, tw, th).
*@li gt_bboxes: Target of the transformation, e.g., ground-truth boxes.
* A 2D Tensor of type float32 or float16 with shape (N, 4).
* "N" indicates the number of ROIs, and 4 indicates "dx", "dy", "dw", and "dh" .
*@li stride: Scale for each box.
* A 1D Tensor of type int32 shape (N,).
* "N" indicates the number of ROIs. \n

*@par Attributes:
*@li performance_mode: Selects the performance mode, "high_precision" or
* "high_performance". With "high_precision" and float32 input, the output
* error is smaller than 0.0001; with "high_performance" and float32 input,
* the op runs at its best performance, but the error is only guaranteed to be
* smaller than 0.005.

*@par Outputs:
*encoded_bboxes: Bboxes generated based on "anchor_boxes" and "gt_bboxes". Have the
* same format and type as "anchor_boxes".
*
*@attention Constraints:
* input anchor boxes only support maximum N=20480. \n
*/
REG_OP(YoloBoxesEncode)
.INPUT(anchor_boxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(gt_bboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(stride, TensorType({DT_INT32}))
.ATTR(performance_mode, String, "high_precision")
.OUTPUT(encoded_bboxes, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(YoloBoxesEncode)

/**
*@brief Assigns positive ground-truth indices to bounding boxes. It is a customized operator for mmdetection.

*@par Inputs:
* Eight inputs, including:
*@li assigned_gt_inds: Tensor of type float16 or float32, shape (n, )
*@li overlaps: A Tensor. Datatype is same as assigned_gt_inds. IOU between gt_bboxes and bboxes. shape(k, n)
*@li box_responsible_flags: A Tensor. Support uint8. Flag to indicate whether box is responsible.
*@li max_overlaps: A Tensor. Datatype is same as assigned_gt_inds. overlaps.max(axis=0).
*@li argmax_overlaps: A Tensor. Support int32. overlaps.argmax(axis=0).
*@li gt_max_overlaps: A Tensor. Datatype is same as assigned_gt_inds. overlaps.max(axis=1).
*@li gt_argmax_overlaps: A Tensor. Support int32. overlaps.argmax(axis=1).
*@li num_gts: A Tensor. Support int32. real k. shape (1, )

*@par Attributes:
*@li pos_iou_thr: float. IOU threshold for positive bboxes.
*@li min_pos_iou: float. Minimum IOU for a bbox to be considered as a positive bbox.
*@li gt_max_assign_all: bool. Whether to assign all bboxes with the same highest overlap with some gt to that gt.

*@par Outputs:
*@li assigned_gt_inds_pos: A Tensor. Support float16/float32. shape (n, ).
*/
REG_OP(GridAssignPositive)
.INPUT(assigned_gt_inds, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.INPUT(overlaps, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.INPUT(box_responsible_flags, TensorType({ DT_UINT8 }))
.INPUT(max_overlaps, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.INPUT(argmax_overlaps, TensorType({ DT_INT32 }))
.INPUT(gt_max_overlaps, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.INPUT(gt_argmax_overlaps, TensorType({ DT_INT32 }))
.INPUT(num_gts, TensorType({ DT_INT32 }))
.OUTPUT(assigned_gt_inds_pos, TensorType({DT_FLOAT, DT_FLOAT16}))
.REQUIRED_ATTR(pos_iou_thr, Float)
.REQUIRED_ATTR(min_pos_iou, Float)
.REQUIRED_ATTR(gt_max_assign_all, Bool)
.OP_END_FACTORY_REG(GridAssignPositive)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_DETECT_OPS_H_


+ 755
- 22
third_party/fwkacllib/inc/ops/nn_norm_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -55,7 +55,9 @@ REG_OP(LogSoftmaxGrad)
*Two inputs, including:
* @li features: A Tensor. Must be one of the following types: half, float32, double.
* A "batch_size * num_classes" matrix.
* @li labels: A Tensor of the same type as "features". batch_size vector with values in [0, num_classes).
* @li labels: A Tensor. Must be one of the following types: 'int32', 'int64'.
* batch_size vector with values in [0, num_classes).
* This is the label for the given minibatch entry.


*@par Outputs:
@@ -105,6 +107,9 @@ REG_OP(SoftmaxCrossEntropyWithLogits)
* @li grad_softmax: A Tensor. Has the same shape and type as "softmax".
* The format is NC1HWC0 or DN . \n

*@par Attributes:
* axes: An optional list of ints. Defaults to "{-1}" . \n

*@par Outputs:
*grad_x: A Tensor. Has the same shape and type as "softmax" . \n

@@ -115,6 +120,7 @@ REG_OP(SoftmaxGrad)
.INPUT(softmax, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.INPUT(grad_softmax, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.OUTPUT(grad_x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_INT8,DT_UINT8}))
.ATTR(axes, ListInt, {-1})
.OP_END_FACTORY_REG(SoftmaxGrad)

/**
@@ -160,20 +166,20 @@ REG_OP(SigmoidCrossEntropyWithLogits)
.OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogits)

/**
*@brief Computes the sigmoid cross entropy loss of "predict" and "target" . \n
*@brief Computes the sigmoid cross entropy loss of "predict" and "target".

*@par Inputs:
* four inputs, including:
*@li predict: A multi-dimensional Tensor of type float16 or float32, specifying the predictive value.
*@li target: A multi-dimensional Tensor of type float16 or float32, specifying the target value . \n
*@li weight: An multi-dimensional Tensor, specifying the weight value. \n
*@li target: A multi-dimensional Tensor of type float16 or float32, specifying the target value.
*@li weight: A multi-dimensional Tensor, specifying the weight value.
*@li pos_weight: A multi-dimensional Tensor, specifying the pos weight value. \n

*@par Attributes:
*reduction: A character string from "none", "mean", and "sum", specifying the reduction type to be applied to the output. Defaults to "mean" . \n
*reduction: A character string from "none", "mean", and "sum", specifying the reduction type to be applied to the output. Defaults to "mean". \n

*@par Outputs:
*loss: Sigmoid cross entropy between the predictive value and target value. Has the same dimensions as "predict" . \n
*loss: Sigmoid cross entropy between the predictive value and target value. Has the same dimensions as "predict". \n

*@par Third-party framework compatibility
* Compatible with PyTorch operator BCEWithLogitsLoss.
@@ -330,6 +336,41 @@ REG_OP(SoftmaxV2)
.ATTR(axes, ListInt, {-1})
.OP_END_FACTORY_REG(SoftmaxV2)

/**
*@brief Function softmax with dropoutDoMaskV3D

*@par Inputs:
*Two inputs, including:
* @li x: A mutable Tensor. The type only support float16.
* @li mask: A mutable Tensor. Must meet all of the following rules:
* the shape of mask should be 1D.
* the dtype of mask should be uint8.
* the shape value should satisfy the following formula:
* value = (size(x) + 128 - 1) // 128 * 128

*@par Attributes:
* @li keep_prob: A mutable Tensor. Must meet all of the following rules:
* shape of "keep_prob" should be (1,) or [1,].
* Has the same type as "x" . \n
* @li axes: A list of int. The dimension softmax would be performed on. Defaults
* to "[-1]" . \n

*@par Outputs:
*y1: A mutable Tensor. Has the same type as "x".
*y2: A mutable Tensor. Has the same type as "x". \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(SoftmaxV2WithDropOutDoMaskV3D)
.INPUT(x, TensorType({DT_FLOAT16}))
.INPUT(mask, TensorType({DT_UINT8}))
.OUTPUT(y1, TensorType({DT_FLOAT16}))
.OUTPUT(y2, TensorType({DT_FLOAT16}))
.REQUIRED_ATTR(keep_prob, Float)
.ATTR(axes, ListInt, {-1})
.OP_END_FACTORY_REG(SoftmaxV2WithDropOutDoMaskV3D)
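An illustrative check of the documented mask-size rule,
value = (size(x) + 128 - 1) // 128 * 128, i.e. the element count of x rounded
up to a multiple of 128 (DropoutMaskLength is a hypothetical name):

#include <cstdint>
#include <cstdio>

int64_t DropoutMaskLength(int64_t num_elements) {
  // Round up to the next multiple of 128, as required for the uint8 mask.
  return (num_elements + 128 - 1) / 128 * 128;
}

int main() {
  std::printf("%lld\n", (long long)DropoutMaskLength(300));  // 384
  return 0;
}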

/**
*@brief Computes log softmax activations . \n

@@ -427,6 +468,33 @@ REG_OP(MVN)
.ATTR(eps, Float, 1e-9)
.OP_END_FACTORY_REG(MVN)

/**
*@brief Normalizes the input . \n

*@par Inputs:
* One input:
*x: An NCHW tensor of type float16 or float32 . \n

*@par Attributes:
*@li eps: An optional float32 epsilon for not dividing by zero. Defaults to "1e-9" . \n
*@li axes: A list of integers, specifying the axes along which to reduce. Defaults to "[0, 2, 3]" . \n

*@par Outputs:
*y: An NCHW tensor of type float16 or float32 . \n

*@attention Constraints:
* The input tensor must have the NCHW format, whose shape length must be 4.
*@par Third-party framework compatibility
* Compatible with the ONNX operator MeanVarianceNormalization.
*/

REG_OP(MVNV2)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16})) /* "First operand." */
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) /* "Result, has same element type as inputs" */
.ATTR(eps, Float, 1e-9)
.ATTR(axes, ListInt, {0, 2, 3})
.OP_END_FACTORY_REG(MVNV2)

/**
*@brief Normalizes the input "x1" . \n

@@ -498,6 +566,31 @@ REG_OP(LayerNorm)
.ATTR(epsilon, Float, 0.0000001)
.OP_END_FACTORY_REG(LayerNorm)

/**
*@brief Returns a tensor where each sub-tensor of input along dimension
* dim is normalized such that the p-norm of the sub-tensor is lower than the value maxnorm. \n

*@par Inputs:
*One input, including:
* @li x: A Tensor. Must be one of the following types: float16, float32 . \n

*@par Attributes:
* @li p: Specify L_p norm, the type is float.
* @li dim: The processed dim, the type is int.
* @li maxnorm: Threshold for comparison, the type is float. \n

*@par Outputs:
*One output, including:
* @li y: A Tensor with the same shape and dtype as input "x".
*/
REG_OP(Renorm)
.INPUT(x, TensorType::BasicType())
.OUTPUT(y, TensorType::BasicType())
.REQUIRED_ATTR(p, Float)
.REQUIRED_ATTR(dim, Int)
.REQUIRED_ATTR(maxnorm, Float)
.OP_END_FACTORY_REG(Renorm)
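
The intended behavior matches torch.renorm; a NumPy sketch under that assumption (the small "eps" guard is illustrative):

```python
import numpy as np

def renorm(x, p, dim, maxnorm, eps=1e-7):
    # Each slice of x along "dim" whose p-norm exceeds maxnorm
    # is rescaled so that its p-norm equals maxnorm.
    axes = tuple(i for i in range(x.ndim) if i != dim)
    norms = np.sum(np.abs(x) ** p, axis=axes, keepdims=True) ** (1.0 / p)
    scale = np.where(norms > maxnorm, maxnorm / (norms + eps), 1.0)
    return x * scale
```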

/**
*@brief LayerNormGrad operator interface implementation
* calculating: dy, x, variance, mean, gamma
@@ -586,6 +679,48 @@ REG_OP(LayerNormXBackprop)
.OUTPUT(pd_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(LayerNormXBackprop)

/**
*@brief LayerNormXBackpropV2 operator interface implementation
* calculating: dy, x, variance, mean, gamma
* pd_xl = data_dy*data_gamma
* pd_var = np.sum(((-0.5)*pd_xl*(data_x - data_mean)
*          *np.power((data_variance + EPSLON), (-1.5))),
*          reduce_axis, keepdims=True)
* pd_mean = np.sum(((-1.0)*pd_xl
*           *np.power((data_variance + EPSLON), (-0.5))),
*           reduce_axis, keepdims=True)
*           + pd_var*(1.0/m)
*           *np.sum(((-2.0)*(data_x - data_mean)), reduce_axis, keepdims=True)
* pd_x = pd_xl*np.power((data_variance + EPSLON), (-0.5)) +
* pd_var*(2.0/m)*(data_x - data_mean) + pd_mean*(1.0/m)
* res_for_gamma = (data_x - data_mean) * np.power((data_variance + EPSLON), (-0.5))

*@par Inputs:
*Five inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32 . \n

*@par Outputs:
*Two outputs, including:
* @li pd_x: A Tensor. Must be one of the following types: float16, float32.
* @li res_for_gamma: A Tensor. Must be one of the following types: float32.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(LayerNormXBackpropV2)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(res_for_gamma, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(LayerNormXBackpropV2)
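
A direct NumPy transcription of the formulas in the comment; EPSLON and the reduce size m follow the comment's notation and are assumptions here:

```python
import numpy as np

EPSLON = 1e-5  # assumed small constant, matching the comment's EPSLON

def layer_norm_x_backprop_v2(dy, x, variance, mean, gamma, reduce_axis):
    m = np.prod([x.shape[a] for a in reduce_axis])   # elements per group
    pd_xl = dy * gamma
    rstd = np.power(variance + EPSLON, -0.5)
    pd_var = np.sum(-0.5 * pd_xl * (x - mean)
                    * np.power(variance + EPSLON, -1.5),
                    axis=reduce_axis, keepdims=True)
    pd_mean = (np.sum(-1.0 * pd_xl * rstd, axis=reduce_axis, keepdims=True)
               + pd_var * (1.0 / m)
               * np.sum(-2.0 * (x - mean), axis=reduce_axis, keepdims=True))
    pd_x = pd_xl * rstd + pd_var * (2.0 / m) * (x - mean) + pd_mean * (1.0 / m)
    res_for_gamma = (x - mean) * rstd
    return pd_x, res_for_gamma
```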

/**
*@brief LayerNormBetaGammaBackprop operator interface implementation
* calculating: dy, x, variance, mean
@@ -629,6 +764,35 @@ REG_OP(LayerNormBetaGammaBackprop)
.REQUIRED_ATTR(shape_gamma, ListInt)
.OP_END_FACTORY_REG(LayerNormBetaGammaBackprop)

/**
*@brief LayerNormBetaGammaBackpropV2 operator interface implementation
* calculating: dy, x, variance, mean
* pd_gamma = np.sum((data_dy*res_for_gamma), param_axis, keepdims=True)
* pd_beta = np.sum(data_dy, param_axis, keepdims=True)

*@par Inputs:
*Two inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li res_for_gamma: A Tensor of type float32 . \n

*@par Outputs:
*Two outputs, including:
* @li pd_gamma: A Tensor. Must be one of the following types: float16, float32.
* @li pd_beta: A Tensor. Must be one of the following types: float16, float32.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(LayerNormBetaGammaBackpropV2)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(res_for_gamma, TensorType({DT_FLOAT}))
.OUTPUT(pd_gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.REQUIRED_ATTR(shape_gamma, ListInt)
.OP_END_FACTORY_REG(LayerNormBetaGammaBackpropV2)
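
The matching reduction step, again as a NumPy sketch of the two formulas above (param_axis is the set of axes reduced away):

```python
import numpy as np

def layer_norm_beta_gamma_backprop_v2(dy, res_for_gamma, param_axis):
    pd_gamma = np.sum(dy * res_for_gamma, axis=param_axis, keepdims=True)
    pd_beta = np.sum(dy, axis=param_axis, keepdims=True)
    return pd_gamma, pd_beta
```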

/**
*@brief Return "output" according to the algorithm of dropout_do_mask:
* scale_x = x *(1 / keep_prob)
@@ -656,7 +820,68 @@ REG_OP(DropOutDoMask)
.INPUT(keep_prob, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(DropOutDoMask)

/**
*@brief Return "output" according to the algorithm of dropout_do_mask:
* scale_x = x *(1 / keep_prob)
* output = select(mask == 1, scale_x, 0)

*@par Inputs:
*Three inputs, including:
* @li x: A mutable Tensor. Must be one of the following types:
* float16, float32
* @li mask: A mutable Tensor. Must meet all of the following rules:
* the shape of "mask" should be 1D,
* the dtype of "mask" should be uint8,
* the value of the shape should meet the following algorithm:
* value = (size(x) + 128 - 1) // 128 * 128
* @li keep_prob: A mutable Tensor. Must meet all of the following rules:
* the shape of "keep_prob" should be (1,) or [1,],
* and it has the same type as "x" . \n

*@par Output:
*y: A mutable Tensor. Has the same type as "x".
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DropOutDoMaskV3)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mask, TensorType({DT_UINT8}))
.INPUT(keep_prob, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(DropOutDoMaskV3)

/**
*@brief Return "output" according to the algorithm of dropout_do_mask:
* scale_x = x *(1 / keep_prob)
* output = select(mask == 1, scale_x, 0)

*@par Inputs:
*Two inputs, including:
* @li x: A mutable Tensor. Must be one of the following types:
* float16, float32
* @li mask: A mutable Tensor. Must meet all of the following rules:
* the shape of "mask" should be 1D,
* the dtype of "mask" should be uint8,
* the value of the shape should meet the following algorithm:
* value = (size(x) + 128 - 1) // 128 * 128
*@par Attributes:
* keep_prob: A required float, specifying the probability that each
* element is kept . \n

*@par Output:
*y: A mutable Tensor. Has the same type as "x".
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DropOutDoMaskV3D)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mask, TensorType({DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.REQUIRED_ATTR(keep_prob, Float)
.OP_END_FACTORY_REG(DropOutDoMaskV3D)
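
The dropout_do_mask algorithm shared by the variants above, as a NumPy sketch (again treating "mask" as one 0/1 byte per element with padding):

```python
import numpy as np

def dropout_do_mask(x, mask, keep_prob):
    scale_x = x * (1.0 / keep_prob)          # rescale the kept values
    keep = mask[:x.size].reshape(x.shape)    # strip the mask padding
    return np.where(keep == 1, scale_x, 0).astype(x.dtype)
```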

/**
*@brief Scales the input . \n

@@ -703,7 +928,7 @@ REG_OP(Scale)

*@par Inputs:
*One input, including:
*@li x: A Tensor. Must be 4-D shape, and only support the following types: float16, float32 . \n
*x: A Tensor. Must be 4-D shape, and only support the following types: float16, float32 . \n

*@par Attributes:
*@li depth_radius: An optional int32, specifying the half-width of the normalization window. Defaults to "5".
@@ -960,24 +1185,532 @@ REG_OP(INInferV2D)
.OP_END_FACTORY_REG(INInferV2D)

/**
* @brief InstanceNorm operator interface implementation.

* @par Inputs:
* Three inputs, including:
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32.
* @li beta: A Tensor. Must be one of the following types: float16, float32.

* @par Attributes:
* @li data_format: An optional attribute of type String. Defaults to "NDHWC". \n
* @li epsilon: An optional attribute of type Float. Defaults to "1e-6". \n

* @par Outputs:
*Three outputs, including:
* @li y: A Tensor. Has the same type as "x". \n
* @li mean: A Tensor. Has the same type as "x". \n
* @li variance: A Tensor. Has the same type as "x". \n

* @par Third-party framework compatibility
* Compatible with the ONNX operator InstanceNormalization.
*/
REG_OP(InstanceNorm)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(gamma, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(beta, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(mean, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(variance, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(data_format, String, "NDHWC")
.ATTR(epsilon, Float, 1e-6)
.OP_END_FACTORY_REG(InstanceNorm)
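
For orientation, a NumPy sketch of the instance-normalization math, assuming a channels-first layout for brevity (the op itself defaults to data_format "NDHWC"):

```python
import numpy as np

def instance_norm(x, gamma, beta, epsilon=1e-6):
    # Normalize each (sample, channel) plane over its spatial axes,
    # then apply the per-channel affine transform.
    axes = tuple(range(2, x.ndim))
    mean = np.mean(x, axis=axes, keepdims=True)
    var = np.var(x, axis=axes, keepdims=True)
    shape = (1, -1) + (1,) * (x.ndim - 2)
    y = (x - mean) / np.sqrt(var + epsilon)
    return y * gamma.reshape(shape) + beta.reshape(shape), mean, var
```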

/**
*@brief InstanceNormGrad operator interface implementation.

*@par Inputs:
*Five inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32 . \n

*@par Outputs:
*Three outputs, including:
* @li pd_x: A Tensor. Must be one of the following types: float16, float32.
* @li pd_gamma: A Tensor. Must be one of the following types: float16, float32.
* @li pd_beta: A Tensor. Must be one of the following types: float16, float32.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(InstanceNormGrad)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(InstanceNormGrad)

/**
*@brief InstanceNormXBackprop operator interface implementation.

*@par Inputs:
*Five inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li variance: A Tensor. Must be one of the following types: float16, float32.
* @li mean: A Tensor. Must be one of the following types: float16, float32.
* @li gamma: A Tensor. Must be one of the following types: float16, float32 . \n

*@par Outputs:
*Two outputs, including:
* @li pd_x: A Tensor. Must be one of the following types: float16, float32.
* @li res_for_gamma: A Tensor. Must be one of the following types: float32.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(InstanceNormXBackprop)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(variance, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(res_for_gamma, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(InstanceNormXBackprop)

/**
*@brief InstanceNormBetaGammaBackprop operator interface implementation.

*@par Inputs:
*Two inputs, including:
* @li dy: A Tensor. Must be one of the following types: float16, float32.
* @li res_for_gamma: A Tensor. Must be one of the following types: float32.\n

*@par Outputs:
*Two outputs, including:
* @li pd_gamma: A Tensor. Must be one of the following types: float16, float32.
* @li pd_beta: A Tensor. Must be one of the following types: float16, float32.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(InstanceNormBetaGammaBackprop)
.INPUT(dy, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(res_for_gamma, TensorType({DT_FLOAT}))
.OUTPUT(pd_gamma, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(pd_beta, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(InstanceNormBetaGammaBackprop)

/**
* @brief Computes Kl_div_loss_grad or Kl_div_loss_backward. \n

* @par Inputs:
* Three inputs, including:
* @li grad: A Tensor. Must be one of the following types: float16, float32.
* Required.
* @li input: A Tensor. Has the same type as "grad". Required.
* @li target: A Tensor. Has the same type as "grad". Required. \n

* @par Attributes:
* @li reduction: An optional attribute of type String. Defaults to "mean". \n
* @li log_target: An optional attribute of type Bool. Defaults to false. \n

* @par Outputs:
* @li y: A Tensor. Has the same type as "grad". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator KlDivLossGrad.
*/
REG_OP(KlDivLossGrad)
.INPUT(grad, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(input, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(target, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(reduction, String, "mean")
.ATTR(log_target, Bool, false)
.OP_END_FACTORY_REG(KlDivLossGrad)

/**
* @brief Computes l1_loss_grad or l1_loss_backward. \n

* @par Inputs:
* Three inputs, including:
* @li grads: A Tensor. Must be one of the following types: float16, float32.
* Required.
* @li predict: A Tensor. Has the same type as "grads". Required.
* @li label: A Tensor. Has the same type as "grads". Required. \n

* @par Attributes:
* @li reduction: An optional attribute of type String. Defaults to "mean". \n

* @par Outputs:
* @li y: A Tensor. Has the same type as "grads". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator L1LossGrad.
*/
REG_OP(L1LossGrad)
.INPUT(grads, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(predict, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(label, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(L1LossGrad)

/**
* @brief Computes the Lp loss, p = 1, 2, 3, ....

* @par Inputs:
* @li predict: An ND tensor of type float16, float32.
* @li label: An ND tensor of type float16, float32. \n

* @par Attributes:
* @li p: A required int attribute that decides which loss to compute. Currently
* only p = 1 (l1_loss) is supported.
* @li reduction: An optional string. Defaults to "mean". \n

* @par Outputs:
* @li y: An ND tensor with the same shape and type as "predict". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator LpLoss.
*/
REG_OP(LpLoss)
.INPUT(predict, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(label, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.REQUIRED_ATTR(p, Int)
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(LpLoss)

/**
* @brief Computes gradients of mse loss.

* @par Inputs:
* @li predict: An ND tensor of type float16, float32.
* @li label: An ND tensor of type float16, float32.
* @li dout: An ND tensor of type float16, float32. \n

* @par Attributes:
* @li reduction: An optional string. Defaults to "mean". \n

* @par Outputs:
* @li y: An ND tensor with the same shape and type as "predict". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator MseLossGrad.
*/
REG_OP(MseLossGrad)
.INPUT(predict, TensorType({DT_FLOAT32, DT_FLOAT16}))
.INPUT(label, TensorType({DT_FLOAT32, DT_FLOAT16}))
.INPUT(dout, TensorType({DT_FLOAT32, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT32, DT_FLOAT16}))
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(MseLossGrad)

/**
* @brief Computes mse loss.
* @par Inputs:
* Two inputs, including:
* @li predict: An ND Tensor of dtype float16 or float32.
* @li label: An ND Tensor of dtype float16 or float32.\n
*
* @par Attributes:
* @li reduction: An optional string from "sum", "none", and "mean". Defaults to "mean".\n
*
* @par Outputs:
* @li y: When reduction is "sum" or "mean", y is a scalar; when reduction is
* "none", y has the same type and shape as "predict".\n
*/
REG_OP(MseLoss)
.INPUT(predict, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(label, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(MseLoss)
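
The "reduction" convention used by the loss operators in this file can be summarized in a few lines of NumPy (illustrative only):

```python
import numpy as np

def mse_loss(predict, label, reduction="mean"):
    loss = (predict - label) ** 2       # element-wise squared error
    if reduction == "mean":
        return np.mean(loss)            # scalar
    if reduction == "sum":
        return np.sum(loss)             # scalar
    return loss                         # "none": same shape as predict
```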

/**
* @brief Calculates the reversed outputs of the function "smooth_l1_loss_v2". \n

* @par Inputs:
* Three Inputs, including:
* @li predict: A Tensor. Must be one of the following types:
* float16, float32.
* @li label: A Tensor. Has the same type as "predict".
* @li dout: A Tensor. Has the same type as "predict". \n

* @par Attributes:
* Two Attributes, including:
* @li sigma: An optional float. Defaults to 1.0. \n

* @li reduction: An optional string. Defaults to "mean",
* Must be one of the following: "none", "mean", "sum". \n

* @par Outputs:
* @li gradient: A Tensor. Has the same type as "predict". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator SmoothL1LossBackward.
*/
REG_OP(SmoothL1LossGradV2)
.INPUT(predict, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(label, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(dout, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(gradient, TensorType({DT_FLOAT, DT_FLOAT16}))
.ATTR(sigma, Float, 1.0)
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(SmoothL1LossGradV2)

/**
* @brief Creates a criterion that uses a squared term if the absolute
* element-wise error falls below beta and an L1 term otherwise. It is
* less sensitive to outliers than the MSELoss and in some cases prevents
* exploding gradients.

* @par Inputs:
* @li predict: A multi-dimensional Tensor of type float16 or float32,
* specifying the predictive value. \n
* @li label: A multi-dimensional Tensor of type float16 or float32,
* specifying the target value. \n

* @par Attributes:
* @li sigma: An optional float. Specifies the threshold of the loss. Defaults
* to "1.0". \n
* @li reduction: An optional str. Specifies the reduction to apply to
* the output: 'none' | 'mean' | 'sum'. 'none': no reduction will be applied,
* 'mean': the sum of the output will be divided by the number of elements in
* the output,'sum': the output will be summed. Default: 'mean'. \n

* @par Outputs:
* @li loss: Indicates the loss between the predictive value and target value.
* Has the same dimensions as "predict". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator smooth_l1_loss. \n
*/
REG_OP(SmoothL1LossV2)
.INPUT(predict, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.INPUT(label, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.OUTPUT(loss, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.ATTR(sigma, Float, 1.0)
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(SmoothL1LossV2)
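
A NumPy sketch of the forward formula, reading "sigma" as the quadratic/linear threshold (beta in PyTorch's formulation):

```python
import numpy as np

def smooth_l1_loss_v2(predict, label, sigma=1.0, reduction="mean"):
    diff = np.abs(predict - label)
    # Quadratic below the threshold, linear above it.
    loss = np.where(diff < sigma,
                    0.5 * diff ** 2 / sigma,
                    diff - 0.5 * sigma)
    if reduction == "mean":
        return np.mean(loss)
    if reduction == "sum":
        return np.sum(loss)
    return loss
```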

/**
* @brief Computes Centralization. result = x - mean(x, axes)

* @par Inputs:
* @li x: An ND tensor of type float16, float32.
* @par Attributes:
* @li axes: The dimensions to reduce. Must be one of the following types: int, list, tuple, NoneType.
* Must be in the range [-rank(x), rank(x)).
* @par Outputs:
* @li y: A Tensor. Has the same type as "x". \n

* @par Third-party framework compatibility
* custom operator \n
*/
REG_OP(Centralization)
.INPUT(x, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.OUTPUT(y, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.ATTR(axes, ListInt, {-1})
.OP_END_FACTORY_REG(Centralization)

/**
*@brief Performs instance normalization for inference of the InHost part.

*@par Inputs:
* One input, including: (NC1HWC0 supported)
* variance: A [N, C1, 1, 1, C0] Tensor of type float32, for the variance.

*@par Attributes:
* epsilon: An optional float32, specifying the small value added to
variance to avoid dividing by zero. Defaults to "0.00001". \n

*@par Outputs:
* variance_sqrt: A [N, C1, 1, 1, C0] Tensor of type float32, for the variance_sqrt.
*/
REG_OP(InHost)
    .INPUT(variance, TensorType({DT_FLOAT}))
    .OUTPUT(variance_sqrt, TensorType({DT_FLOAT}))
    .ATTR(epsilon, Float, 0.00001)
    .OP_END_FACTORY_REG(InHost)

/**
*@brief Roll the tensor along the given dimension(s).
* Elements that are shifted beyond the last position are re-introduced at the first position.
* If a dimension is not specified, the tensor will be flattened before rolling and then restored to the original shape. \n

*@par Inputs:
*One input, including:
* x: A tensor. Must be one of the following types:
* float16, float32, int32, uint32, int8, uint8. \n

*@par Attributes:
* @li shifts: The number of places by which the elements of the tensor are shifted. \n
* @li dims: Axis along which to roll. \n

*@par Outputs:
* y: A Tensor with the same type and shape as "x". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator Roll. \n
*/
REG_OP(Roll)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_UINT32,DT_INT8,DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT32,DT_UINT32,DT_INT8,DT_UINT8}))
.REQUIRED_ATTR(shifts, ListInt)
.ATTR(dims, ListInt, {})
.OP_END_FACTORY_REG(Roll)
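
The described behavior maps directly onto np.roll; a sketch:

```python
import numpy as np

def roll(x, shifts, dims=()):
    # No dims: flatten, roll, and restore the original shape.
    if not dims:
        return np.roll(x.reshape(-1), shifts[0]).reshape(x.shape)
    return np.roll(x, shifts, axis=tuple(dims))
```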

/**
*@brief Calculate the loss. Creates a criterion that optimizes a two-class classification
logistic loss between input_x and input_y (containing 1 or -1). \n

*@par Inputs:
*Two inputs, including:
* @li input_x: A tensor. Must be one of the following types:
* float16, float32. \n
* @li input_y: A tensor. Must be one of the following types:
* float16, float32. \n

*@par Attributes:
*@li reduction: An optional string. Defaults to "mean". \n

*@par Outputs:
*output_z: When reduction is "none", a Tensor with the same type and shape as "input_x". \n
* When reduction is "sum" or "mean", a Tensor with the same type as "input_x" and shape (1,).

*@par Third-party framework compatibility
*Compatible with the Pytorch operator SoftMarginLoss. \n
*/
REG_OP(SoftMarginLoss)
.INPUT(input_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(input_y, TensorType({DT_FLOAT, DT_FLOAT16}))
.ATTR(reduction, String, "mean")
.OUTPUT(output_z, TensorType({DT_FLOAT, DT_FLOAT16}))
.OP_END_FACTORY_REG(SoftMarginLoss)
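
A NumPy sketch of the two-class logistic loss, with labels in {+1, -1} as described above:

```python
import numpy as np

def soft_margin_loss(input_x, input_y, reduction="mean"):
    loss = np.log1p(np.exp(-input_y * input_x))
    if reduction == "mean":
        return np.atleast_1d(np.mean(loss))   # shape (1,)
    if reduction == "sum":
        return np.atleast_1d(np.sum(loss))    # shape (1,)
    return loss                               # "none"
```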

/**
* @brief Computes gradients of sigmoid_cross_entropy_with_logits_v2.

* @par Inputs:
* @li predict: An ND tensor of type float16, float32.
* @li target: An ND tensor of type float16, float32.
* @li dout: An ND tensor of type float16, float32.
* @li weight: An optional ND tensor of type float16, float32.
* @li pos_weight: An optional ND tensor of type float16, float32. \n

* @par Attributes:
* @li reduction: An optional string. Defaults to "mean". \n

* @par Outputs:
* @li gradient: An ND tensor with the same shape and type as "predict". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator SigmoidCrossEntropyWithLogitsGrad.
*/
REG_OP(SigmoidCrossEntropyWithLogitsGradV2)
.INPUT(predict, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(target, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(dout, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(weight, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(pos_weight, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(gradient, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogitsGradV2)
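
For reference, the BCE-with-logits gradient with optional "weight" and "pos_weight" can be sketched as follows; the closed form is standard, but treat this as an illustration rather than the kernel's exact computation:

```python
import numpy as np

def sigmoid_ce_with_logits_grad_v2(predict, target, dout,
                                   weight=None, pos_weight=None,
                                   reduction="mean"):
    s = 1.0 / (1.0 + np.exp(-predict))
    pw = 1.0 if pos_weight is None else pos_weight
    # d/dpredict of -[pw*t*log(s) + (1-t)*log(1-s)]
    grad = s * (1.0 + target * (pw - 1.0)) - pw * target
    if weight is not None:
        grad = grad * weight
    if reduction == "mean":
        grad = grad / predict.size
    return grad * dout
```
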
/**
* @brief Calculate the PoissonNllLoss function.
* target ~ Poisson(input); loss(input, target) = input - target * log(input) + log(target!) \n

* @par Inputs:
* Two inputs, including:
* @li input_x: A tensor. Must be one of the following types:
* float16, float32.
* @li target: A tensor. Must be one of the following types:
* float16, float32. \n

* @par Attributes:
* Four attributes, including:
* @li log_input: An optional bool. Defaults to "True".
* @li full: An optional bool. Defaults to "False".
* @li eps: An optional float. Defaults to "1e-8".
* @li reduction: An optional string. Defaults to "mean". \n

* @par Outputs:
* loss: A Tensor with the same element type as the two inputs. \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator PoissonNllLoss. \n
*/
REG_OP(PoissonNllLoss)
.INPUT(input_x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(target, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(loss, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(log_input, Bool, true)
.ATTR(full, Bool, false)
.ATTR(eps, Float, 1e-8)
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(PoissonNllLoss)
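
A NumPy sketch following the PyTorch semantics assumed by the compatibility note (with the Stirling term for "full"):

```python
import numpy as np

def poisson_nll_loss(input_x, target, log_input=True, full=False,
                     eps=1e-8, reduction="mean"):
    if log_input:
        loss = np.exp(input_x) - target * input_x
    else:
        loss = input_x - target * np.log(input_x + eps)
    if full:
        t = np.maximum(target, 1.0)   # guard log(0); term is 0 for t <= 1
        loss += np.where(target > 1,
                         t * np.log(t) - t + 0.5 * np.log(2 * np.pi * t),
                         0.0)
    if reduction == "mean":
        return np.mean(loss)
    if reduction == "sum":
        return np.sum(loss)
    return loss
```
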
/**
*@brief Generates the sequence mask for an RNN (rnn_gen_mask).
* @par Inputs:
* seq_length: An ND Tensor of type int32. Records the valid length of each batch.\n
*
* @par Attributes:
* @li num_step: A required int.\n
* @li hidden_size: A required int. \n
*
*
* @par Output:
* seq_mask: A mutable Tensor of type float16, with the shape of [num_step, batch_size, hidden_size]. \n
*
*/
REG_OP(RnnGenMask)
.INPUT(seq_length, TensorType({DT_INT32}))
.OUTPUT(seq_mask, TensorType({DT_FLOAT16}))
.REQUIRED_ATTR(num_step, Int)
.REQUIRED_ATTR(hidden_size, Int)
.OP_END_FACTORY_REG(RnnGenMask)
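
The mask generation reduces to a broadcast comparison; a NumPy sketch:

```python
import numpy as np

def rnn_gen_mask(seq_length, num_step, hidden_size):
    # seq_mask[t, b, :] is 1.0 while step t is within batch b's valid length.
    batch_size = seq_length.shape[0]
    steps = np.arange(num_step).reshape(num_step, 1)
    valid = steps < seq_length.reshape(1, batch_size)
    return np.broadcast_to(valid[:, :, None],
                           (num_step, batch_size, hidden_size)
                           ).astype(np.float16)
```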

/**
* @brief Creates a criterion that optimizes a multi-class multi-classification hinge loss (margin-based loss)
* between input x (a 2D mini-batch Tensor) and output y (which is a 2D Tensor of target class indices) \n
* @par Inputs:
* Two inputs, including:
* @li x: A tensor. Must be one of the following types:
* float16, float32.
* @li target: A tensor of type int32. \n

* @par Attributes:
* reduction: An optional string. Defaults to "mean". \n

* @par Outputs:
* @li y: A Tensor with the same element type as input "x".
* @li is_target: A Tensor with the same element type as input "target". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator MultiLabelMarginLoss. \n
*/
REG_OP(MultilabelMarginLoss)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(target, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(is_target, TensorType({DT_INT32}))
.ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(MultilabelMarginLoss)

/**
*@brief Performs batch normalization . \n
*@par Inputs:
* Two inputs, including:
*@li input_x: A Tensor of type float32. Shape (n, c, d).
*@li seq_len: A Tensor of type int32, recording the number of elements to normalize in each batch. Shape (n, ). \n
*@par Attributes:
*@li normalize_type: A string, either "per_feature" or "all_features".
*@li epsilon: An optional float32, specifying the small value added to
variance to avoid dividing by zero. Defaults to "0.00001" . \n
*@par Outputs:
* One output, including:
*@li output_y: A Tensor for the normalized "x". Type float32, shape (n, c, d).\n
*/
REG_OP(NormalizeBatch)
.INPUT(input_x, TensorType({ DT_FLOAT }))
.INPUT(seq_len, TensorType({ DT_INT32 }))
.OUTPUT(output_y, TensorType({ DT_FLOAT }))
.REQUIRED_ATTR(normalize_type, String)
.ATTR(epsilon, Float, 0.00001)
.OP_END_FACTORY_REG(NormalizeBatch)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_NORM_OPS_H_

+ 139
- 2
third_party/fwkacllib/inc/ops/nn_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,7 +20,144 @@
*/
#ifndef OPS_BUILT_IN_OP_PROTO_INC_NN_OPS_H_
#define OPS_BUILT_IN_OP_PROTO_INC_NN_OPS_H_
#include "graph/operator_reg.h"
#include "nn_pooling_ops.h"

namespace ge {
/**
* @brief Says whether the targets are in the top "k" predictions . \n

* @par Inputs:
* Three inputs, including:
* @li predictions: A 2D Tensor of type float32. A "batch_size * classes" tensor.
* @li targets: A 1D Tensor of type IndexNumberType. A batch_size tensor of class ids.
* @li k: A 1D Tensor of the same type as "targets".
* Specifies the number of top elements to look at for computing precision . \n

* @par Outputs:
* precision: A Tensor of type bool . \n

* @attention Constraints:
* @li "targets" must be a non-negative tensor.

* @par Third-party framework compatibility
* @li Compatible with the TensorFlow operator InTopKV2.
*/
REG_OP(InTopKV2)
.INPUT(predictions, TensorType({DT_FLOAT}))
.INPUT(targets, TensorType(IndexNumberType))
.INPUT(k, TensorType({IndexNumberType}))
.OUTPUT(precision, TensorType({DT_BOOL}))
.OP_END_FACTORY_REG(InTopKV2)
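
A NumPy sketch of the top-k membership test (tie handling in the real TensorFlow op is slightly more involved):

```python
import numpy as np

def in_top_k_v2(predictions, targets, k):
    # precision[i] is True if targets[i] ranks among the top k classes.
    rows = np.arange(len(targets))
    target_scores = predictions[rows, targets]
    better = np.sum(predictions > target_scores[:, None], axis=1)
    return better < k
```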

/**
*@brief Performs batch normalization . \n

*@par Inputs:
* Five inputs, including: (NHWC, NCHW, or NC1HWC0 supported)
*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the scaling factor.
*@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the offset.
*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the mean used for inference. Must be "None" if the
operation is used for training.
*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be
5D if input "x" is with format NC1HWC0. Specifies the variance used for inference. Must be "None"
if the operation is used for training . \n

*@par Attributes:
*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.0001".
*@li data_format: An optional string, specifying the format of "x". Defaults to "NHWC".
*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True" . \n

*@par Outputs:
* Five outputs, including: (NHWC, NCHW, or NC1HWC0 supported)
*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
if input "x" is with format NC1HWC0. Specifies the mean of "x".
*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x".
*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output . \n

*@attention Constraints:
*@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available,
then "reserve_space_1" has the same value as "mean" and "reserve_space_2" has the same value as "variance".
*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction . \n
*/
REG_OP(FusedBatchNormV2)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(scale, TensorType({DT_FLOAT}))
.INPUT(offset, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(mean, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(variance, TensorType({DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(batch_mean, TensorType({DT_FLOAT}))
.OUTPUT(batch_variance, TensorType({DT_FLOAT}))
.OUTPUT(reserve_space_1, TensorType({DT_FLOAT}))
.OUTPUT(reserve_space_2, TensorType({DT_FLOAT}))
.ATTR(epsilon, Float, 0.0001)
.ATTR(data_format, String, "NHWC")
.ATTR(is_training, Bool, true)
.OP_END_FACTORY_REG(FusedBatchNormV2)

/**
* @brief: Sorts a large amount of data. The first operator of TopK.
* @par Inputs:
* Two inputs, including:
* @li input_data: A Tensor. Data to be sorted. Supports float16.
* @li input_index: A Tensor. Range(0, 2048). Datatype and format are the same as input_data.
* @par Attributes:
* @li k_num: An int. The number of elements to be sorted.
* @par Outputs:
* One output, including:
* @li output_proposal: A Tensor. Datatype and format are the same as input_data. Proposal sorted for each channel.
*/
REG_OP(SegmentSort)
.INPUT(input_data, TensorType({DT_FLOAT16}))
.INPUT(input_index, TensorType({DT_FLOAT16}))
.OUTPUT(output_proposal, TensorType({DT_FLOAT16}))
.REQUIRED_ATTR(k_num, Int)
.OP_END_FACTORY_REG(SegmentSort)

/**
* @brief: Sorts a large amount of data. The second operator of TopK.
* @par Inputs:
* One input, including:
* @li input_proposal: A Tensor. Proposal sorted for each channel. Supports float16.
* @par Attributes:
* @li k_num: An int. The number of elements to be sorted.
* @par Outputs:
* One output, including:
* @li output_proposal: A Tensor. Datatype and format are the same as input_data. Proposal sorted for each channel.
*/
REG_OP(MultiMerge)
.INPUT(input_proposal, TensorType({DT_FLOAT16}))
.OUTPUT(output_proposal, TensorType({DT_FLOAT16}))
.REQUIRED_ATTR(k_num, Int)
.OP_END_FACTORY_REG(MultiMerge)

/**
* @brief: Sorts a large amount of data. The third operator of TopK.
* @par Inputs:
* One input, including:
* @li input_proposal: A Tensor. Proposal sorted for each channel. Supports float16.
* @par Attributes:
* @li k_num: An int. The number of elements to be sorted.
* @par Outputs:
* Two outputs, including:
* @li output_data: A Tensor. Datatype and format are the same as input_data. Sorted data.
* @li output_index: A Tensor of type int32. Data indices.
*/
REG_OP(SingleMerge)
.INPUT(input_proposal, TensorType({DT_FLOAT16}))
.OUTPUT(output_data, TensorType({DT_FLOAT16}))
.OUTPUT(output_index, TensorType({DT_INT32}))
.REQUIRED_ATTR(k_num, Int)
.OP_END_FACTORY_REG(SingleMerge)
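
Together the three operators implement a chunked top-k: per-chunk sorting, multi-way merging of the sorted runs, and a final merge that emits values and indices. A host-side sketch of the idea (the 2048-element chunking is an assumption for illustration):

```python
import heapq
import numpy as np

def chunked_topk(data, k_num, chunk=2048):
    # SegmentSort: sort each chunk independently (descending).
    runs = [np.sort(data[i:i + chunk])[::-1]
            for i in range(0, len(data), chunk)]
    # MultiMerge / SingleMerge: merge sorted runs, keep the top k.
    merged = heapq.merge(*runs, reverse=True)
    k = min(k_num, len(data))
    values = np.fromiter(merged, dtype=data.dtype, count=k)
    indices = np.argsort(data)[::-1][:k].astype(np.int32)
    return values, indices
```
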
}// namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_OPS_H_

+ 470
- 18
third_party/fwkacllib/inc/ops/nn_pooling_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -182,6 +182,128 @@ REG_OP(AvgPool3D)
.ATTR(data_format, String, "NDHWC")
.OP_END_FACTORY_REG(AvgPool3D)


/**
*@brief Performs average pooling on the input.

*@par Inputs:
*@li x: A 5-D Tensor of shape [batch, depth, height, width, channels] and type float16, float32, double.
*@li filter: An optional tensor of type float16, float32, double, fractal_z_3d layout.
*@li multiplier: An optional tensor of float16, float32, double.

*@par Attributes:
*@li ksize: List of ints that has length 1, 3 or 5. The size of the window for each dimension of the input tensor.
*@li strides: List of ints that has length 1, 3 or 5. The stride of the sliding window for each dimension of the input tensor.
*@li pads: List of ints, implicit zero paddings on both sides of the input.
*@li ceil_mode: When true, will use ceil instead of floor in the formula to compute the output shape.
*@li count_include_pad: When true, will include the zero-padding in the averaging calculation.
*@li divisor_override: if specified, it will be used as divisor, otherwise size of the pooling region will be used.
*@li data_format: A string, format of input data . \n

*@par Outputs:
*y: The average pooled output tensor . \n

*@attention Constraints:
*@li "ksize" is in the range [1, 255]. "strides" is in the range [1, 63]

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator AvgPool3D.
*/
REG_OP(AvgPool3DD)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE}))
.OPTIONAL_INPUT(filter, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE}))
.OPTIONAL_INPUT(multiplier, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE}))
.REQUIRED_ATTR(ksize, ListInt)
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(pads, ListInt)
.ATTR(ceil_mode, Bool, false)
.ATTR(count_include_pad, Bool, true)
.ATTR(divisor_override, Int, 0)
.ATTR(data_format, String, "NDHWC")
.OP_END_FACTORY_REG(AvgPool3DD)

/**
* @brief Computes AvgPool3DGrad function.

* @par Inputs:
* @li orig_input_shape: An NDHWC tensor of type int32.
* @li grads: An NDHWC tensor of type float16, float32, or double.

* @par Attributes:
* @li ksize: List of ints that has length 5. The size of the window for each dimension of the input tensor.
* @li strides: List of ints that has length 5. The stride of the sliding window for each dimension of the input tensor.
* @li pads: List of ints, implicit zero paddings on both sides of the input.
* @li ceil_mode: When true, will use ceil instead of floor in the formula to compute the output shape.
* @li count_include_pad: When true, will include the zero-padding in the averaging calculation.
* @li divisor_override: if specified, it will be used as divisor, otherwise size of the pooling region will be used.
* @li data_format: A string, format of input data.

* @par Outputs:
* output: A mutable tensor with the shape given by "orig_input_shape" and the same type as "grads".

* @attention Constraints:
* @li "ksize" is in the range [1, 255]. "strides" is in the range [1, 63]

* @par Third-party framework compatibility
* @li Compatible with the TensorFlow operator AvgPoolGrad.
*/

REG_OP(AvgPool3DGrad)
.INPUT(orig_input_shape, TensorType({DT_INT32}))
.INPUT(grads, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE}))
.OUTPUT(output, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE}))
.REQUIRED_ATTR(ksize, ListInt)
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(pads, ListInt)
.ATTR(ceil_mode, Bool, false)
.ATTR(count_include_pad, Bool, true)
.ATTR(divisor_override, Int, 0)
.ATTR(data_format, String, "NDHWC")
.OP_END_FACTORY_REG(AvgPool3DGrad)

/**
* @brief Performs average pooling on the input.

* @par Inputs:
* @li grads: An NDHWC tensor of type float16.
* @li filter: An optional tensor of type float16, fractal_z_3d layout.
* @li multiplier: An optional tensor of float16.

* @par Attributes:
* @li orig_input_shape: List of ints that has length 5. The shape of the original input tensor.
* @li ksize: List of ints that has length 5. The size of the window for each dimension of the input tensor.
* @li strides: List of ints that has length 5. The stride of the sliding window for each dimension of the input tensor.
* @li pads: List of ints, implicit zero paddings on both sides of the input.
* @li ceil_mode: When true, will use ceil instead of floor in the formula to compute the output shape.
* @li count_include_pad: When true, will include the zero-padding in the averaging calculation.
* @li divisor_override: if specified, it will be used as divisor, otherwise size of the pooling region will be used.
* @li data_format: A string, format of input data . \n

* @par Outputs:
* output: The average pooled output tensor . \n

* @attention Constraints:
* @li "ksize" is in the range [1, 255]. "strides" is in the range [1, 63]

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator AvgPool3DGradD.
*/
REG_OP(AvgPool3DGradD)
.INPUT(grads, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(filter, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(multiplier, TensorType({DT_FLOAT16}))
.OUTPUT(output, TensorType({DT_FLOAT16}))
.REQUIRED_ATTR(orig_input_shape, ListInt)
.REQUIRED_ATTR(ksize, ListInt)
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(pads, ListInt)
.ATTR(ceil_mode, Bool, false)
.ATTR(count_include_pad, Bool, true)
.ATTR(divisor_override, Int, 0)
.ATTR(data_format, String, "NDHWC")
.OP_END_FACTORY_REG(AvgPool3DGradD)

/**
*@brief Performs max_pool_ext2 on the input . \n

@@ -278,8 +400,8 @@ No default value.
specifying the stride of the sliding window for each dimension of
the input tensor. No default value.
*@li padding: A required string.
*@li pads: A list type of int32. Default value {0, 0, 0}.
*@li dilation: A list type of int32. Default value {1, 1, 1}.
*@li pads: A list type of int32. Default value {0,0,0,0,0,0}.
*@li dilation: A list type of int32. Default value {1,1,1,1,1,1}.
*@li ceil_mode: A ceil mode number of int32 . Default value 0.
*@li data_format: An optional string. Defaults to "NDHWC" . \n

@@ -302,12 +424,37 @@ REG_OP(MaxPool3D)
.REQUIRED_ATTR(ksize, ListInt)
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(padding, String)
.ATTR(pads, ListInt, {0,0,0})
.ATTR(dilation, ListInt, {1,1,1})
.ATTR(pads, ListInt, {0,0,0,0,0,0})
.ATTR(dilation, ListInt, {1,1,1,1,1,1})
.ATTR(ceil_mode, Int, 0)
.ATTR(data_format, String, "NDHWC")
.OP_END_FACTORY_REG(MaxPool3D)

/**
*@brief Applies a 2D adaptive max pooling over an input signal composed of several input planes. \n
* The output is of size H x W, for any input size.

* @par Inputs:
* One input, including:
* @li x: A Tensor. Must be one of the following data types:
* float16, float32, float64. \n

* @par Attributes:
* @li output_size: A required list of 2 ints
* specifying the size (H,W) of the output tensor. \n

* @par Outputs:
* @li y: A Tensor. Has the same data type as "x".
* @li argmax: A Tensor of type IndexNumberType, holding the indices of the max values. \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator AdaptiveMaxPool2d.
*/
REG_OP(AdaptiveMaxPool2d)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE}))
.OUTPUT(argmax, TensorType::IndexNumberType())
.REQUIRED_ATTR(output_size, ListInt)
.OP_END_FACTORY_REG(AdaptiveMaxPool2d)

/**
* @brief Computes second-order gradients of the maxpooling3d function . \n
@@ -477,8 +624,9 @@ REG_OP(MaxPoolV2)

*@par Inputs:
* One input:
* x: A 4D Tensor. Supported type: float, double, int32,
* uint8, int16, int8, int64, uint16, half, uint32, uint64.
* Must set the format, supported format list ["NCHW", "NHWC"]. \n

*@par Attributes:
*@li ksize: A required list of int8, int16, int32, or int64 values,
@@ -490,8 +638,8 @@ REG_OP(MaxPoolV2)
*@li padding: A required string. No default value . \n

*@par Outputs:
*@li y: A Tensor. Has the same type and format as input "x".
*@li argmax: A Tensor. Has the same type and format as input "x".
*@attention Constraints:
*@li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1,
* ksize[1] * ksize[2] <= 255.
@@ -517,10 +665,12 @@ REG_OP(MaxPoolWithArgmax)

*@par Inputs:
* Three inputs, including:
*@li x: A 4D tensor. Supported type: float, double, int32,
* uint8, int16, int8, int64, uint16, half, uint32, uint64.
* Must set the format, supported format list ["NCHW", "NHWC"].
*@li grad: A 4D tensor. Supported type: float, double, int32,
* uint8, int16, int8, int64, uint16, half, uint32, uint64.
* Must set the format, supported format list ["NCHW", "NHWC"].
*@li argmx: A 4D tensor of type int32 or int64 . \n

*@par Attributes:
@@ -741,7 +891,7 @@ REG_OP(AvgPoolV2Grad)
* @brief Computes gradients of averagev2 pooling function.

* @par Inputs:
*input_grad: An NHWC tensor of type float16, float32, or double.

* @par Attributes:
* @li orig_input_shape: A required tuple or list of type int32.
@@ -759,10 +909,10 @@ REG_OP(AvgPoolV2Grad)
* @li data_format: An optional string. Defaults to "NHWC".

* @par Outputs:
*out_grad: A mutable tensor with the same shape and type as "orig_input".

* @par Third-party framework compatibility
*Compatible with the TensorFlow operator AvgPoolGrad.
*/
REG_OP(AvgPoolV2GradD)
.INPUT(input_grad, TensorType({DT_FLOAT16}))
@@ -1037,6 +1187,7 @@ REG_OP(MaxPool3DGrad)
.OUTPUT(y, TensorType::RealNumberType())
.REQUIRED_ATTR(ksize, ListInt)
.REQUIRED_ATTR(strides, ListInt)
.ATTR(padding, String, "SAME")
.REQUIRED_ATTR(pads, ListInt)
.ATTR(data_format, String, "NDHWC")
.OP_END_FACTORY_REG(MaxPool3DGrad)
@@ -1107,7 +1258,7 @@ REG_OP(AvgPool1DD)

*@par Inputs:
* One input:
*x: A 4D Tensor of type float16. Must set the format, supported format list ["NCHW", "NHWC"].
*@par Attributes:
*@li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for
* each dimension of the input tensor. No default value.
@@ -1148,9 +1299,9 @@ REG_OP(MaxPoolWithArgmaxV2)

*@par Inputs:
* Three inputs, including:
*@li x: A 4D tensor of type float16. Must set the format, supported format list ["NCHW", "NHWC"].
*@li grad: A 4D tensor of type float16. Must set the format, supported format list ["NCHW", "NHWC"].
*@li argmx: A 4D tensor of type uint16 or int64. Must set the format, supported format list ["NCHW", "NHWC"]. \n

*@par Attributes:
*@li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for
@@ -1291,5 +1442,306 @@ REG_OP(MaxPoolV3Grad)
.ATTR(global_pooling, Bool, false)
.ATTR(ceil_mode, Bool, false)
.OP_END_FACTORY_REG(MaxPoolV3Grad)

/**
*@brief Performs Dilation2D on the input . \n

*@par Inputs:
*x: A 4D tensor. Supported format: NHWC.
*filter: A 3D tensor with the same type as "x"; its C dimension is the same as that of "x". \n

*@par Attributes:
*@li strides: A required list of 4 ints, specifying the stride of the sliding window. The strides of the N and C dimensions are 1.
*@li rates: A required list of 4 ints. The rates of the N and C dimensions are 1.
*@li padding_mode: An optional string. Defaults to "SAME"; supports "SAME" and "VALID".
*@li pads: An optional list of 4 ints.
*@li ceil_mode: An optional bool. Defaults to "false". Use ceil or floor to calculate the output size when padding_mode is "CALCULATED".
*@li data_format: An optional string, specifying the data format of "rates" and "strides", either "NCHW" or "NHWC" (default). \n

*@par Outputs:
*y: The output tensor. Has the same type and format as input "x" . \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator Dilation2D.
*/
REG_OP(Dilation2D)
.INPUT(x,TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.INPUT(filter,TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.OUTPUT(y,TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(rates, ListInt)
.ATTR(padding_mode, String, "SAME")
.ATTR(pads, ListInt, {0,0,0,0})
.ATTR(ceil_mode, Bool, false)
.ATTR(data_format, String, "NHWC")
.OP_END_FACTORY_REG(Dilation2D)

/**
*@brief Performs Dilation2DBackpropFilter on the input. \n

*@par Inputs:
*x: A 4D tensor. Supported format: NHWC.
*filter: A 3D tensor with the same type as "x"; its C dimension is the same as that of "x".
*out_backprop: Has the same type and format as input "x"; its C dimension is the same as that of "x". \n

*@par Attributes:
*@li strides: A required list of 4 ints, specifying the stride of the sliding window. The strides of the N and C dimensions are 1.
*@li rates: A required list of 4 ints, the rates of the N and C dimensions are 1.
*@li padding_mode: An optional string. Defaults to "SAME"; supports "SAME" and "VALID".
*@li pads: An optional list of 4 ints.
*@li ceil_mode: An optional bool. Defaults to "false". Use ceil or floor to calculate the output size when padding_mode is "CALCULATED".
*@li data_format: An optional string, specifying the data format of "rates" and "strides", either "NCHW" or "NHWC" (default). \n

*@par Outputs:
*y: The output tensor. Has the same type and format as input "filter" . \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator Dilation2DBackpropFilter.
*/

REG_OP(Dilation2DBackpropFilter)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.INPUT(filter,
TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.INPUT(out_backprop,
TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.OUTPUT(y,
TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(rates, ListInt)
.ATTR(padding_mode, String, "SAME")
.ATTR(pads, ListInt, {0, 0, 0, 0})
.ATTR(ceil_mode, Bool, false)
.ATTR(data_format, String, "NHWC")
.OP_END_FACTORY_REG(Dilation2DBackpropFilter)

/**
*@brief Performs Dilation2DBackpropInput on the input. \n

*@par Inputs:
*x: A 4D tensor. Supported format: NHWC.
*filter: A 3D tensor with the same type as "x"; its C dimension is the same as that of "x".
*out_backprop: Has the same type and format as input "x"; its C dimension is the same as that of "x". \n

*@par Attributes:
*@li strides: A required list of 4 ints, specifying the stride of the sliding window. The strides of the N and C dimensions are 1.
*@li rates: A required list of 4 ints, the rates of the N and C dimensions are 1.
*@li padding_mode: An optional string. Defaults to "SAME"; supports "SAME" and "VALID".
*@li pads: An optional list of 4 ints.
*@li ceil_mode: An optional bool. Defaults to "false". Use ceil or floor to calculate the output size when padding_mode is "CALCULATED".
*@li data_format: An optional string, specifying the data format of "rates" and "strides", either "NCHW" or "NHWC" (default). \n

*@par Outputs:
*y: The output tensor. Has the same type and format as input "x" . \n

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator Dilation2DBackpropInput.
*/

REG_OP(Dilation2DBackpropInput)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.INPUT(filter,
TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.INPUT(out_backprop,
TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.OUTPUT(y,
TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_UINT8, DT_INT16, DT_INT8, DT_UINT16}))
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(rates, ListInt)
.ATTR(padding_mode, String, "SAME")
.ATTR(pads, ListInt, {0, 0, 0, 0})
.ATTR(ceil_mode, Bool, false)
.ATTR(data_format, String, "NHWC")
.OP_END_FACTORY_REG(Dilation2DBackpropInput)

/**
* @brief Applies a 2D adaptive average pooling over
* an input signal composed of several input planes. \n

* @par Inputs:
* One input, including:
* @li x: A Tensor. Must be one of the following data types:
* float16, float32. \n

* @par Attributes:
* @li output_size: A required list of 2 ints
* specifying the size (H,W) of the output tensor. \n

* @par Outputs:
* @li y: A Tensor. Has the same data type as "x". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator AdaptiveAvgPool2d.
*/
REG_OP(AdaptiveAvgPool2d)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.REQUIRED_ATTR(output_size, ListInt)
.OP_END_FACTORY_REG(AdaptiveAvgPool2d)
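
The adaptive bin boundaries follow the usual floor/ceil interpolation; a NumPy sketch assuming NCHW input:

```python
import numpy as np

def adaptive_avg_pool2d(x, output_size):
    n, c, h, w = x.shape
    oh, ow = output_size
    y = np.empty((n, c, oh, ow), dtype=x.dtype)
    for i in range(oh):
        h0, h1 = (i * h) // oh, -(-((i + 1) * h) // oh)   # floor, ceil
        for j in range(ow):
            w0, w1 = (j * w) // ow, -(-((j + 1) * w) // ow)
            y[:, :, i, j] = x[:, :, h0:h1, w0:w1].mean(axis=(2, 3))
    return y
```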

/**
* @brief Compute gradients of adaptive averagev2 pooling function.

* @par Inputs:
* @li input_grad: A Tensor. Must be one of the following data types:
* float16, float32.

* @par Attributes:
* @li orig_input_shape: A required tuple or list of type int32.

* @par Outputs:
* @li output_grad: A tensor with the same type as "input_grad".

* @par Third-party framework compatibility
* Compatible with the Pytorch operator AdaptiveAvgPool2dGrad.
*/
REG_OP(AdaptiveAvgPool2dGrad)
.INPUT(input_grad, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(output_grad, TensorType({DT_FLOAT, DT_FLOAT16}))
.REQUIRED_ATTR(orig_input_shape, ListInt)
.OP_END_FACTORY_REG(AdaptiveAvgPool2dGrad)

/**
* @brief Performs the backpropagation of MaxPoolWithGradArgmaxV1.

* @par Inputs:
* Three inputs, including:
* @li x: An NC1HWC0 tensor of type float16.
* @li grad: An NC1HWC0 tensor of type float16.
* @li argmax: An NC1HWC0 tensor of type uint16 or int64. \n

* @par Attributes:
* @li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for
* each dimension of the input tensor. No default value.
* @li strides: A required list of int8, int16, int32, or int64 values, specifying the stride of the sliding window for
* each dimension of the input tensor. No default value.
* @li pads: A required list of ints. \n

* @par Outputs:
* y: A Tensor. Has the same type and format as input "x". \n

* @attention Constraints:
* @li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
* @li "strides" is a list that has length 4: strides[0] = 1 or strides[3] = 1
* @li "pads" is listint.
* @li "ceil_mode" defaults to False.
* @li "data_format" defaults to "NC1HWC0". \n

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator MaxPoolGradWithArgmaxV1.
*/

REG_OP(MaxPoolGradWithArgmaxV1)
.INPUT(x, TensorType({DT_FLOAT16}))
.INPUT(grad, TensorType({DT_FLOAT16}))
.INPUT(argmax, TensorType({DT_UINT16}))
.OUTPUT(y, TensorType({DT_FLOAT16}))
.REQUIRED_ATTR(ksize, ListInt)
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(pads, ListInt)
.ATTR(dtype, Int, 3)
.ATTR(dilation, ListInt, {1, 1, 1, 1})
.ATTR(ceil_mode, Bool, false)
.OP_END_FACTORY_REG(MaxPoolGradWithArgmaxV1)

/**
* @brief Performs max pooling on the input and outputs both max values and indices.

* @par Inputs:
* One input:
* x: An NC1HWC0 Tensor of type float16. \n

* @par Attributes:
* @li ksize: A required list of int8, int16, int32, or int64 values, specifying the size of the window for
* each dimension of the input tensor. No default value.
* @li strides: A required list of int8, int16, int32, or int64 values, specifying the stride of the sliding window for
* each dimension of the input tensor. No default value.
* @li pads: A required list of ints. No default value. \n

* @par Outputs:
* y: A Tensor. Has the same type and format as input "x".
* argmax: A Tensor. type:uint16, format:NC1HWC0. \n

* @attention Constraints:
* @li "ksize" is a list that has length 4: ksize[0] = 1 or ksize[3] = 1, ksize[1] * ksize[2] <= 255.
* @li "stride is a list that has length 4: strides[0] = 1 or strides[3] = 1, strides[1] <= 63, strides[0] >= 1,
* strides[2] <= 63, strides[2] >= 1.
* @li "pads" is listint.
* @li "ceil_mode" defaults to False.
* @li "data_format" defaults to "NC1HWC0". \n

* @par Third-party framework compatibility
* Compatible with the TensorFlow operator MaxPoolWithArgmaxV1.
*/
REG_OP(MaxPoolWithArgmaxV1)
.INPUT(x, TensorType({DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT16}))
.OUTPUT(argmax, TensorType({DT_UINT16}))
.REQUIRED_ATTR(ksize, ListInt)
.REQUIRED_ATTR(strides, ListInt)
.REQUIRED_ATTR(pads, ListInt)
.ATTR(dtype, Int, 3)
.ATTR(dilation, ListInt, {1, 1, 1, 1})
.ATTR(ceil_mode, Bool, false)
.OP_END_FACTORY_REG(MaxPoolWithArgmaxV1)

/**
*@brief Randomly sample a subset of positive and negative examples, and overwrite
the label vector to the ignore value (-1) for all elements that are not
included in the sample.\n

* @par Inputs:
* One input:
* labels: shape of labels,(N, ) label vector with values. \n

* @par Attributes:
* @li batch_size_per_images: A required attribute of type int.
* @li positive_fraction: A required attribute of type float.

*@par Outputs:
*y: The result of subSample. \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator SubSample.
*@par Restrictions:
*Warning: This operator can be integrated only by MaskRcnn. Please do not use it directly.
*/
REG_OP(SubSample)
.INPUT(labels, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_INT32}))
.REQUIRED_ATTR(batch_size_per_images, Int)
.REQUIRED_ATTR(positive_fraction, Float)
.OP_END_FACTORY_REG(SubSample)

/**
*@brief Randomly sample a subset of positive and negative examples, and overwrite
the label vector to the ignore value (-1) for all elements that are not
included in the sample.\n

* @par Inputs:
* two inputs, including:
* @li labels: shape of labels, (N, ), a label vector with values.
* @li shuffle_matrix: random matrix with shape (N, ). \n

* @par Attributes:
* @li batch_size_per_images: A required attribute of type int.
* @li positive_fraction: A required attribute of type float.

*@par Outputs:
*y: The result of subSample. \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator SubSampleLabels.
*@par Restrictions:
*Warning: This operator can be integrated only by MaskRcnn. Please do not use it directly.
*/
REG_OP(SubSampleLabels)
.INPUT(labels, TensorType({DT_INT32}))
.INPUT(shuffle_matrix, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_INT32}))
.REQUIRED_ATTR(batch_size_per_images, Int)
.REQUIRED_ATTR(positive_fraction, Float)
.OP_END_FACTORY_REG(SubSampleLabels)

} // namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_POOLING_OPS_H

+ 50
- 1
third_party/fwkacllib/inc/ops/nn_training_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -2101,6 +2101,55 @@ REG_OP(FusedMulApplyMomentumExtern)
.ATTR(use_locking, Bool, false)
.OP_END_FACTORY_REG(FusedMulApplyMomentumExtern)

/**
*@brief Updates '*var' according to the momentum scheme.
* accum = accum * momentum - x1 * x2 * lr
* if use_nesterov is True:
* var += accum * momentum - x1 * x2 * lr
* else:
* var += accum
*
*@par Inputs:
*@li var: A mutable tensor. Must be one of the data types defined in
* TensorType::NumberType(). Should be from a Variable().
*@li accum: A mutable tensor. Has the same type as "var". Should be from a
* Variable().
*@li lr: A tensor for the learning rate. Has the same type as "var". Should be
* from a Variable().
*@li x1: A Tensor of type TensorType::NumberType().
*@li momentum: A scalar. Has the same type as "var".
*@li x2: A scalar. Has the same type as "var".
*
*@par Attributes:
*@li use_nesterov: An optional bool. Defaults to "False".
* If "True", var will be updated by using Nesterov momentum.
*@li use_locking: An optional bool. Defaults to "False".
* If "True", updating of the "var" tensor is protected by a lock;
* otherwise the behavior is undefined, but may exhibit less contention.
*
*@par Outputs:
* var: A mutable tensor. Has the same type as input "var".
*
*@attention Constraints:
* The input tensors must have the same shape.
*
*@par Third-party framework compatibility
* Compatible with the TensorFlow operator ResourceApplyKerasMomentum.
*
*/
REG_OP(FusedMulApplyKerasMomentum)
.INPUT(var, TensorType::NumberType())
.INPUT(accum, TensorType::NumberType())
.INPUT(lr, TensorType::NumberType())
.INPUT(x1, TensorType::NumberType())
.INPUT(momentum, TensorType::NumberType())
.INPUT(x2, TensorType::NumberType())
.OUTPUT(var, TensorType::NumberType())
.OUTPUT(accum, TensorType::NumberType())
.ATTR(use_locking, Bool, false)
.ATTR(use_nesterov, Bool, false)
.OP_END_FACTORY_REG(FusedMulApplyKerasMomentum)
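
As a reading aid, here is a minimal scalar sketch of the update rule documented above (illustrative only; the kernel operates on whole tensors, and the helper name is invented):

// Scalar form of the documented rule, treating the gradient as x1 * x2.
void ApplyKerasMomentumStep(float &var, float &accum, float lr, float x1,
                            float momentum, float x2, bool use_nesterov) {
  const float grad = x1 * x2;
  accum = accum * momentum - grad * lr;
  if (use_nesterov) {
    var += accum * momentum - grad * lr;  // Nesterov look-ahead step
  } else {
    var += accum;
  }
}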

/**
*@brief Update "g" according to the LARS algorithm . \n



+ 1
- 1
third_party/fwkacllib/inc/ops/no_op.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 406
- 2
third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -223,7 +223,29 @@ REG_OP(Relu6Grad)
.INPUT(features, TensorType::RealNumberType())
.OUTPUT(backprops, TensorType::RealNumberType())
.OP_END_FACTORY_REG(Relu6Grad)

/**
*@brief Calculates the elu_grad_v2 function.
*Applies the element-wise function:
* Computes the gradient of elu: 1 if x > 0; otherwise elu(x) + alpha.
*@par Inputs:
*Two inputs, including:
* @li grads: A tensor. Must be one of the following types:
* float16, float32.
* @li activations: A tensor. Must be one of the following types:
* float16, float32.
*
*@par Outputs:
*y: A Tensor with the same type and shape as "grads".
*
*@par Attributes:
*@li alpha: An optional float scalar. Defaults to 1.0.
*/
REG_OP(EluGradV2)
.INPUT(grads, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(activations, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.ATTR(alpha, Float, 1.0)
.OP_END_FACTORY_REG(EluGradV2)
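
A hedged element-wise sketch of the formula above; since elu is monotonic with elu(0) = 0, the x > 0 branch can be tested on the saved forward output (function name is illustrative):

// dy = grads                     if x > 0
// dy = grads * (elu(x) + alpha)  otherwise, with elu(x) passed in as activation.
float EluGradV2Elem(float grad, float activation, float alpha) {
  return activation > 0.0f ? grad : grad * (activation + alpha);
}
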
/**
* @brief Compute sigmoid of "x" element-wise . \n

@@ -508,6 +530,42 @@ REG_OP(Elu)
.ATTR(alpha, Float, 1.0)
.OP_END_FACTORY_REG(Elu)

/**
*@brief Continuously Differentiable Exponential Linear Units:
* Perform the linear unit element-wise on the input tensor X using formula:
* max(0, x) + min(0, alpha * (exp(x/alpha) - 1)). \n

*@par Inputs:
*x: A Tensor of type float16 or float32, for the input data. \n

*@par Attributes:
*@li alpha1: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0".
*@li alpha2: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0".
*@li alpha3: A float32. Defines at which positive value the ELU saturates. Defaults to "1.0" . \n

*@par Outputs:
*y: A Tensor of type float16 or float32, for the result . \n

*@attention Constraints:
*@li The input is of type float16 or float32 . \n

*@par Multiple batches supported or not
*Supported
*@par Third-party framework compatibility
*@li Compatible with ONNX's Celu operator
*/
REG_OP(Celu)
.INPUT(x, TensorType({DT_FLOAT,DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT,DT_FLOAT16}))
.ATTR(alpha1, Float, 1.0)
.ATTR(alpha2, Float, 1.0)
.ATTR(alpha3, Float, 1.0)
.OP_END_FACTORY_REG(Celu)
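
For reference, a scalar sketch of the one-parameter CELU formula quoted in the brief; how alpha1/alpha2/alpha3 map onto it is not spelled out here, so the sketch assumes the standard single-alpha form:

#include <algorithm>
#include <cmath>

// max(0, x) + min(0, alpha * (exp(x / alpha) - 1)); alpha must be nonzero.
float CeluElem(float x, float alpha) {
  return std::max(0.0f, x) +
         std::min(0.0f, alpha * (std::exp(x / alpha) - 1.0f));
}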

/**
*@brief Computes gradients for the exponential linear (Elu) operation.
*
@@ -640,6 +698,352 @@ REG_OP(Mish)
.OUTPUT(y, TensorType({ DT_FLOAT,DT_FLOAT16 }))
.OP_END_FACTORY_REG(Mish)

/**
* @brief Computes the pytorch mish_grad operator.
* @par Inputs:
* Three inputs, including:
* @li grad: A Tensor. Shape, datatype and format are the same as "x".
* @li x: A Tensor. Must be one of the following types: float16, float32.
* @li tanhx: An optional Tensor. Shape, datatype and format are the same as "x".
* @par Outputs:
* One output, including:
* @li x_grad: A Tensor. Shape, datatype and format are the same as "x".
*/

REG_OP(MishGrad)
.INPUT(grad, TensorType({ DT_FLOAT,DT_FLOAT16 }))
.INPUT(x, TensorType({ DT_FLOAT,DT_FLOAT16 }))
.OPTIONAL_INPUT(tanhx, TensorType({ DT_FLOAT,DT_FLOAT16 }))
.OUTPUT(x_grad, TensorType({ DT_FLOAT,DT_FLOAT16 }))
.OP_END_FACTORY_REG(MishGrad)
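
A hedged sketch of the backward pass, using the identity mish(x) = x * tanh(softplus(x)); the optional tanhx input is assumed to carry tanh(softplus(x)) saved from the forward pass (recompute it if absent):

#include <cmath>

// d/dx mish(x) = tanhx + x * (1 - tanhx^2) * sigmoid(x)
float MishGradElem(float grad, float x, float tanhx) {
  const float sig = 1.0f / (1.0f + std::exp(-x));
  return grad * (tanhx + x * (1.0f - tanhx * tanhx) * sig);
}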

/**
* @brief pytorch hardtanh_backward operator.
*
* @par Inputs:
* 2 inputs, including:
* @li result, output tensor of the forward hardtanh,
* datatype: float16/float32, format:ND/5HD.
* @li grad, gradient tensor from the next layer,
* datatype:float16/float32, format:ND/5HD. \n

* @par Attributes:
* 2 attributes, including:
* @li min_val, minimum value of the linear region range, datatype:float.
* @li max_val, maximum value of the linear region range, datatype:float. \n

* @par Outputs:
* 1 output, including:
* @li y, hardtanh_backward output tensor, datatype and format is same as
* input result. \n

* @attention Constraints:
* This operator only supports dataType: float16/float32, format: ND/5HD. \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator HardtanhGrad.
*/
REG_OP(HardtanhGrad)
.INPUT(result, TensorType({ DT_FLOAT16, DT_FLOAT })) /* "First operand." */
.INPUT(grad, TensorType({ DT_FLOAT16, DT_FLOAT })) /* "Second operand." */
.OUTPUT(y, TensorType({ DT_FLOAT16, DT_FLOAT })) /* "Result, has same element type as two inputs" */
.ATTR(min_val, Float, -1.0)
.ATTR(max_val, Float, 1.0)
.OP_END_FACTORY_REG(HardtanhGrad)
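
Element-wise, the backward rule is simply a pass-through inside the linear region; a minimal sketch (name illustrative):

// The incoming gradient flows only where the forward output lies strictly
// inside (min_val, max_val); clamped positions get zero gradient.
float HardtanhGradElem(float result, float grad, float min_val, float max_val) {
  return (result > min_val && result < max_val) ? grad : 0.0f;
}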

/**
* @brief Calculates the softplus activation function with attributes beta and threshold. \n

* @par Inputs:
* One input, including:
* @li x: A mutable Tensor. Must be one of the following types:
* float16, float32. \n

* @par Attributes:
* @li beta: An optional float. Defaults to "1.0" \n

* @li threshold: An optional float. Defaults to "20.0" \n

* @par Outputs:
* @li y: A mutable Tensor. Has the same type as "x" \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator Softplus.
*/
REG_OP(SoftplusV2)
.INPUT(x, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.OUTPUT(y, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.ATTR(beta, Float, 1.0)
.ATTR(threshold, Float, 20.0)
.OP_END_FACTORY_REG(SoftplusV2)
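
A sketch assuming the PyTorch convention for softplus with beta/threshold: y = (1/beta) * log(1 + exp(beta * x)), reverting to the identity once beta * x exceeds threshold to avoid overflow:

#include <cmath>

float SoftplusV2Elem(float x, float beta, float threshold) {
  // Linear region: for large beta * x, softplus(x) ~= x.
  return (beta * x > threshold) ? x : std::log1p(std::exp(beta * x)) / beta;
}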

/**
* @brief Calculates the gradient of the function "softplus_v2". \n

* @par Inputs:
* Two inputs, including:
* @li input_gradients: A mutable Tensor. Must be one of the following types:
* float16, float32.
* @li input_features: A mutable Tensor of the same type as "input_gradients" \n

* @par Attributes:
* @li beta: An optional float. Defaults to "1.0" \n

* @li threshold: An optional float. Defaults to "20.0" \n

* @par Outputs:
* @li output_backprops: A mutable Tensor. Has the same type as "input_gradients" \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator SoftplusGrad.
*/
REG_OP(SoftplusV2Grad)
.INPUT(input_gradients, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.INPUT(input_features, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.OUTPUT(output_backprops, TensorType({ DT_FLOAT, DT_FLOAT16 }))
.ATTR(beta, Float, 1.0)
.ATTR(threshold, Float, 20.0)
.OP_END_FACTORY_REG(SoftplusV2Grad)
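
Under the same assumed convention, the gradient is sigmoid(beta * x) below the threshold and 1 in the linear region (sketch, names illustrative):

#include <cmath>

float SoftplusV2GradElem(float dy, float x, float beta, float threshold) {
  if (beta * x > threshold) return dy;       // linear region
  return dy / (1.0f + std::exp(-beta * x));  // dy * sigmoid(beta * x)
}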

/**
* @brief ThresholdedRelu takes one input data (Tensor) and produces one output data (Tensor)
* where the rectified linear function, y = x for x > alpha, y = 0 otherwise, is applied to the tensor elementwise.
*
* @par Inputs:
* One input, including:
* @li x: A Tensor. Must be one of the following types: float32, float16.
*
* @par Attributes:
* @li alpha: An optional float. Defaults to 1.0.
*
* @par Outputs:
* One output, including:
* @li y: A Tensor of the same type as "x".
*
*/
REG_OP(ThresholdedRelu)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(alpha, Float, 1.0)
.OP_END_FACTORY_REG(ThresholdedRelu)
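
The element-wise rule in the brief, as a one-line sketch:

// y = x if x > alpha, else 0.
float ThresholdedReluElem(float x, float alpha) {
  return x > alpha ? x : 0.0f;
}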

/**
* @brief Calculate the hard shrinkage function. \n

* @par Inputs:
* One input, including:
* @li input_x: A tensor. Must be one of the following types:
* float16, float32. \n

* @par Attributes:
* @li lambd: An optional float. Defaults to 0.5. \n

* @par Outputs:
* y: A Tensor with the same dtype and shape as "input_x". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator Hardshrink. \n
*/
REG_OP(HardShrink)
.INPUT(input_x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(lambd, Float, 0.5)
.OP_END_FACTORY_REG(HardShrink)
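
Hard shrinkage zeroes everything inside [-lambd, lambd] and keeps the rest unchanged; a minimal sketch:

#include <cmath>

float HardShrinkElem(float x, float lambd) {
  return std::fabs(x) > lambd ? x : 0.0f;
}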

/**
*@brief Calculates the hard shrink grad function. \n
*
* Computes the gradient for the HardShrink: if x > lambd or x < -lambd, the gradient passes through; otherwise 0
*
*@par Inputs:
*Two inputs, including:
* @li gradients: A tensor. Must be one of the following types:
* float16, float32. \n
* @li features: A tensor. Must be one of the following types:
* float16, float32. \n
*
*@par Outputs:
*backprops: A Tensor with the same type and shape as "features". \n
*
*@par Attributes:
*@li lambd: An optional float. Defaults to 0.5. \n
*
*@par Third-party framework compatibility
*Compatible with the Pytorch operator Hardshrink_backward. \n
*/
REG_OP(HardShrinkGrad)
.INPUT(gradients, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(features, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(backprops, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(lambd, Float, 0.5)
.OP_END_FACTORY_REG(HardShrinkGrad)
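
The matching backward rule from the comment above, sketched element-wise:

// Gradient passes where |features| > lambd, and is zero elsewhere.
float HardShrinkGradElem(float gradient, float feature, float lambd) {
  return (feature > lambd || feature < -lambd) ? gradient : 0.0f;
}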

/**
* @brief Calculate the hard sigmoid function. \n

* @par Inputs:
* One input, including:
* @li input_x: A tensor. Must be one of the following types:
* float16, float32, int32. \n

* @par Attributes:
* @li alpha: An optional float. Defaults to 0.16666666. \n
* @li beta: An optional float. Defaults to 0.5. \n

* @par Outputs:
* y: A Tensor with the same dtype and shape as "input_x". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator Hardsigmoid. \n
*/
REG_OP(HardSigmoid)
.INPUT(input_x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.OUTPUT(output_y, TensorType({DT_FLOAT, DT_FLOAT16}))
.ATTR(alpha, Float, 0.16666666)
.ATTR(beta, Float, 0.5)
.OP_END_FACTORY_REG(HardSigmoid)
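
With the documented defaults alpha = 1/6 and beta = 0.5, hard sigmoid is the usual clamp; a sketch:

#include <algorithm>

// clamp(alpha * x + beta, 0, 1)
float HardSigmoidElem(float x, float alpha, float beta) {
  return std::min(1.0f, std::max(0.0f, alpha * x + beta));
}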

/**
* @brief Calculate the soft shrinkage function. \n

* @par Inputs:
* One input, including:
* @li input_x: A tensor. Must be one of the following types:
* float16, float32. \n

* @par Attributes:
* @li lambd: An optional float. Defaults to 0.5. \n

* @par Outputs:
* y: A Tensor with the same dtype and shape as "input_x". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator Softshrink. \n
*/
REG_OP(SoftShrink)
.INPUT(input_x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(lambd, Float, 0.5)
.OP_END_FACTORY_REG(SoftShrink)
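
Soft shrinkage shifts values outside [-lambd, lambd] toward zero by lambd and zeroes the rest; a sketch:

float SoftShrinkElem(float x, float lambd) {
  if (x > lambd) return x - lambd;
  if (x < -lambd) return x + lambd;
  return 0.0f;
}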

/**
* @brief Calculates the gradient of the function "soft_shrink". \n

* @par Inputs:
* Two inputs, including:
* @li input_grad: A tensor. Must be one of the following types:
* float16, float32. \n
* @li input_x: A tensor of the same dtype as "input_grad". \n

* @par Attributes:
* @li lambd: An optional float. Defaults to 0.5. \n

* @par Outputs:
* y: A Tensor of the same dtype and shape as "input_grad". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator SoftShrinkGrad. \n
*/
REG_OP(SoftShrinkGrad)
.INPUT(input_grad, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(input_x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(lambd, Float, 0.5)
.OP_END_FACTORY_REG(SoftShrinkGrad)

/**
*@brief Calculates the gradient of log sigmoid. \n

*@par Inputs:
*Two inputs, including:
* @li grads: A tensor, gradient of previous layer. Must be one of the following types:
* float16, float32. \n
* @li features: A tensor, input of log sigmoid. Must be one of the following types:
* float16, float32. \n

*@par Outputs:
*One output, including:
* @li backprops: A tensor with the same type and shape as "grads". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator LogSigmoidBackward. \n
*/
REG_OP(LogSigmoidGrad)
.INPUT(grads, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(features, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(backprops, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(LogSigmoidGrad)
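
Since d/dx log(sigmoid(x)) = sigmoid(-x), the backward pass reduces to a single stable expression (sketch):

#include <cmath>

// grad * sigmoid(-x) = grad / (1 + exp(x)); saturates to 0 for large x.
float LogSigmoidGradElem(float grad, float feature) {
  return grad / (1.0f + std::exp(feature));
}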

/**
*@brief Calculate -ln(1+e^(-x)). \n

*@par Inputs:
*One input, including:
* @li x: A tensor. Must be one of the following types:
* float16, float32. \n

*@par Outputs:
*One output, including:
* @li y: A tensor with the same type and shape as "x". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator LogSigmoid. \n
*/
REG_OP(LogSigmoid)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) /* "input:x" */
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) /* "output:y" */
.OP_END_FACTORY_REG(LogSigmoid)
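
Evaluating -ln(1 + e^(-x)) directly overflows for large negative x; a numerically stable sketch uses the identity log sigmoid(x) = min(0, x) - log1p(exp(-|x|)):

#include <algorithm>
#include <cmath>

float LogSigmoidElem(float x) {
  return std::min(0.0f, x) - std::log1p(std::exp(-std::fabs(x)));
}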

/**
*@brief Calculates the gradient of the function "hard_sigmoid". \n

*@par Inputs:
*Two inputs, including:
* @li grads: A tensor. Must be one of the following types:
* float16, float32. \n
* @li input_x: A tensor. Must be one of the following types:
* float16, float32. \n

*@par Outputs:
*One output, including:
* @li y: A tensor with the same type and shape as "grads". \n

* @par Attributes:
* @li alpha: An optional float. Defaults to 0.16666666. \n
* @li beta: An optional float. Defaults to 0.5. \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator HardSigmoidGrad. \n
*/
REG_OP(HardSigmoidGrad)
.INPUT(grads, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(input_x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.ATTR(alpha, Float, 0.16666666)
.ATTR(beta, Float, 0.5)
.OP_END_FACTORY_REG(HardSigmoidGrad)
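
Assuming the clamp form above, the gradient is alpha wherever the forward output is strictly between 0 and 1, and zero where it is clamped (sketch):

float HardSigmoidGradElem(float grad, float x, float alpha, float beta) {
  const float t = alpha * x + beta;
  return (t > 0.0f && t < 1.0f) ? grad * alpha : 0.0f;
}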

/**
* @brief Calculate the shrink function. \n

* @par Inputs:
* One input, including:
* @li input_x: A tensor. Must be one of the following types:
* float16, float32. \n

* @par Attributes:
* @li lambd: An optional float. Defaults to 0.5. \n
* @li bias: An optional float. Defaults to 0.0. \n

* @par Outputs:
* y: A Tensor with the same dtype and shape as "input_x". \n

* @par Third-party framework compatibility
* Compatible with the ONNX operator Shrink. \n
*/
REG_OP(Shrink)
.INPUT(input_x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_y, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(lambd, Float, 0.5)
.ATTR(bias, Float, 0.0)
.OP_END_FACTORY_REG(Shrink)
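
ONNX Shrink generalizes the two shrink variants above through a bias term; a sketch of the ONNX definition:

// y = x - bias if x > lambd; y = x + bias if x < -lambd; else 0.
// bias == lambd recovers SoftShrink; bias == 0 recovers HardShrink.
float ShrinkElem(float x, float lambd, float bias) {
  if (x > lambd) return x - bias;
  if (x < -lambd) return x + bias;
  return 0.0f;
}
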
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_NONLINEAR_FUC_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/npu_loss_scale_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/outfeed_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 114
- 46
third_party/fwkacllib/inc/ops/pad_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -101,7 +101,7 @@ REG_OP(FillD)
*/
REG_OP(BroadcastTo)
.INPUT(x, TensorType::BasicType())
.INPUT(shape, TensorType({DT_INT32}))
.INPUT(shape, TensorType({DT_INT32,DT_INT64}))
.OUTPUT(y, TensorType::BasicType())
.OP_END_FACTORY_REG(BroadcastTo)

@@ -161,7 +161,7 @@ REG_OP(Pad)
*@brief Pads a tensor . \n

*@par Inputs:
*x: A Tensor. Must be one of the following types: float16, float32, int8, uint8, int32 . \n
*x: A Tensor. Must be one of the following types: float16, float32, int32 . \n

*@par Attributes:
*paddings: An optional "vector<vector<int>>". Defaults to "{}".
@@ -180,8 +180,8 @@ REG_OP(Pad)
* Warning: THIS FUNCTION IS DEPRECATED. Please use Pad instead.
*/
REG_OP(PadD)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.REQUIRED_ATTR(paddings, ListListInt)
.OP_END_FACTORY_REG(PadD)

@@ -213,7 +213,7 @@ REG_OP(PadV2)
*@brief Pads a tensor . \n

*@par Inputs:
*x: A Tensor. Must be one of the following types: float16, float32, int8, uint8, int32 . \n
*x: A Tensor. Must be one of the following types: float16, float32, int32 . \n
*constant_values: A Tensor. Must have the same type as input.

*@par Attributes:
@@ -227,10 +227,7 @@ REG_OP(PadV2)
*y: A Tensor of the same type as "x" . \n

*@par Third-party framework compatibility:
* Compatible with TensorFlow operator Pad.
*
* @par Restrictions:
* Warning: THIS FUNCTION IS DEPRECATED. Please use Pad instead.
* Compatible with TensorFlow operator PadV2.
*/
REG_OP(PadV2D)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
@@ -272,42 +269,42 @@ REG_OP(PadV3)
.ATTR(paddings_contiguous, Bool, true)
.OP_END_FACTORY_REG(PadV3)

/**
*@brief Pads a tensor.
*@par Inputs:
*x: A Tensor. Must be one of the following types: float16, float32, int8, uint8, int32.
*@par Attributes:
* @li paddings: A required "vector<vector<int>>".
* For each dimension D of input, paddings[D, 0] indicates how many
* values to add before the contents of tensor in that dimension,
* and paddings[D, 1] indicates how many values to add after the
* contents of tensor in that dimension.
* @li constant_values: An optional int value for pad.
* @li mode: An optional string, Defaults to "constant", indicates paddings mode,
* support "constant", "reflect", "edge"
* @li paddings_contiguous: An optional bool value, Defaults to true.
* If true, paddings is arranged as [[begin0, end0], [begin1, end1], ...]
* If false, paddings is arranged as [[begin0, begin1], ..., [end0, end1], ...]
*@par Outputs:
*y: A Tensor of the same type as "x".
*@par Third-party framework compatibility:
* Compatible with ONNX operator Pad.
* @par Restrictions:
* Warning: THIS FUNCTION IS DEPRECATED. Please use PadV3 instead.
*/
REG_OP(PadV3D)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8}))
.REQUIRED_ATTR(paddings, ListListInt)
.ATTR(constant_values, Int, 0)
.ATTR(mode, String, "constant")
.ATTR(paddings_contiguous, Bool, true)
.OP_END_FACTORY_REG(PadV3D)

/**
*@brief Create a diagonal tensor
@@ -403,5 +400,76 @@ REG_OP(EmbeddingRankId)
.ATTR(mode, String, "mod")
.OP_END_FACTORY_REG(EmbeddingRankId)

/**
*@brief EmbeddingLocalIndex: sorts the statistics index according to rank_id. \n

*@par Inputs:
* @li addr_table: A 2D tensor which last dimension must be 3.
* @li index: A tensor with data type int32, int64, uint32, uint64.

*@par Attributes:
* @li row_memory: The size of Embedding vector in a row, the default is 320.
* @li mode: String type, currently there are two options: 'mod' and 'order'

*@par Outputs:
* @li local_idx: Index on each server.
* @li nums: The number of local_idx found on each server.
* @li recover_idx: The sorted local_idx element is at the position corresponding
* to the original input index.

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator Diag.
*/
REG_OP(EmbeddingLocalIndex)
.INPUT(addr_table, TensorType({DT_UINT64}))
.INPUT(index, TensorType({DT_INT64,DT_INT32,DT_UINT32,DT_UINT64}))
.OUTPUT(local_idx, TensorType({DT_INT64,DT_INT32,DT_UINT32,DT_UINT64}))
.OUTPUT(nums, TensorType({DT_INT64,DT_INT32,DT_UINT32,DT_UINT64}))
.OUTPUT(recover_idx, TensorType({DT_INT64,DT_INT32,DT_UINT32,DT_UINT64}))
.ATTR(row_memory, Int, 320)
.ATTR(mode, String, "mod")
.OP_END_FACTORY_REG(EmbeddingLocalIndex)

/**
* @brief Fills a tensor of the specified shape with the given value.

* @par Inputs:
* One input, including:
* @li dims: A Tensor. Specifies the shape of the output tensor.

* @par Attributes:
* @li value: An optional float value. Defaults to 0.0.

* @par Outputs:
* @li y: A Tensor. Has the shape specified by input "dims", and is filled with the value specified by attr "value".

* @par Third-party framework compatibility
* Compatible with the ONNX operator ConstantOfShape.
*/
REG_OP(FillV2)
.INPUT(dims, TensorType({DT_INT16, DT_INT32, DT_INT64}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64}))
.ATTR(value, Float, 0)
.OP_END_FACTORY_REG(FillV2)

/**
* @brief Fills a tensor of the specified shape with the given value.

* @par Attributes:
* @li value: An optional float value. Defaults to 0.0.

* @li dims: A required ListInt. Specifies the shape of the output tensor.

* @par Outputs:
* @li y: A Tensor. Has the shape specified by attr "dims", and is filled with the value specified by attr "value".

* @par Third-party framework compatibility
* Compatible with the ONNX operator ConstantOfShape.
*/
REG_OP(FillV2D)
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64}))
.ATTR(value, Float, 0)
.REQUIRED_ATTR(dims, ListInt)
.OP_END_FACTORY_REG(FillV2D)
} // namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_PAD_OPS_H_

+ 241
- 1
third_party/fwkacllib/inc/ops/parsing_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -51,6 +51,246 @@ REG_OP(StringToNumber)
.ATTR(out_type, Type, DT_FLOAT)
.OP_END_FACTORY_REG(StringToNumber)

/**
*@brief Parses a single serialized Example prototype into typed tensors.
*@par Inputs:
*serialized: A Tensor of type string.
*dense_defaults: A dynamic input Tensor of type string, float or int64. \n

*@par Attributes:
*num_sparse: An int, the number of sparse_indices, sparse_values and sparse_shapes outputs.
*sparse_keys: A ListString.
*dense_keys: A ListString.
*sparse_types: Types of sparse_values.
*Tdense: Types of dense_values.
*dense_shapes: Shapes of dense_values. \n

*@par Outputs:
*sparse_indices: A Tensor of type int64.
*sparse_values: Has the same type as sparse_types.
*sparse_shapes: A Tensor of type int64
*dense_values: Has the same type as dense_defaults.

*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
**/
REG_OP(ParseSingleExample)
.INPUT(serialized, TensorType({DT_STRING}))
.DYNAMIC_INPUT(dense_defaults, TensorType({DT_STRING,DT_FLOAT,DT_INT64}))
.DYNAMIC_OUTPUT(sparse_indices, TensorType({DT_INT64}))
.DYNAMIC_OUTPUT(sparse_values, TensorType({DT_STRING,DT_FLOAT,DT_INT64}))
.DYNAMIC_OUTPUT(sparse_shapes, TensorType({DT_INT64}))
.DYNAMIC_OUTPUT(dense_values, TensorType({DT_STRING,DT_FLOAT,DT_INT64}))
.ATTR(num_sparse, Int, 0)
.ATTR(sparse_keys, ListString, {})
.ATTR(dense_keys, ListString, {})
.ATTR(sparse_types, ListType, {})
.ATTR(Tdense, ListType, {})
.ATTR(dense_shapes, ListListInt, {})
.OP_END_FACTORY_REG(ParseSingleExample)

/**
*@brief Decodes raw bytes into a tensor. \n
*@par Inputs:
*bytes: A Tensor of type string.

*@par Attributes:
*little_endian: A bool. Defaults to true.
*out_type: The output type.

*@par Outputs:
*output: A Tensor.
**/
REG_OP(DecodeRaw)
.INPUT(bytes, TensorType({DT_STRING}))
.OUTPUT(output, TensorType({DT_BOOL,DT_FLOAT16,DT_DOUBLE,DT_FLOAT,
DT_INT64,DT_INT32,DT_INT8,DT_UINT8,DT_INT16,
DT_UINT16,DT_COMPLEX64,DT_COMPLEX128}))
.ATTR(out_type, Type, DT_FLOAT)
.ATTR(little_endian, Bool, true)
.OP_END_FACTORY_REG(DecodeRaw)

/**
*@brief Convert serialized tensorflow.TensorProto prototype to Tensor. \n

*@par Inputs:
*serialized: A Tensor of string type. Scalar string containing serialized
*TensorProto prototype. \n

*@par Attributes:
*out_type: The type of the serialized tensor. The provided type must match the
*type of the serialized tensor and no implicit conversion will take place. \n

*@par Outputs:
*output: A Tensor of type out_type. \n

*@attention Constraints:
*The implementation for ParseTensor on Ascend uses AICPU,
*with bad performance. \n

*@par Third-party framework compatibility
*@li compatible with tensorflow ParseTensor operator.
*/
REG_OP(ParseTensor)
.INPUT(serialized, TensorType({DT_STRING}))
    .OUTPUT(output, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16,
DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_UINT32,
DT_UINT64, DT_BOOL, DT_DOUBLE, DT_STRING,
DT_COMPLEX64, DT_COMPLEX128}))
.ATTR(out_type, Type, DT_FLOAT)
.OP_END_FACTORY_REG(ParseTensor)

/**
*@brief Converts CSV records to tensors. Each column maps to one
*tensor. \n

*@par Inputs:
*Inputs include:
*records: Each string is a record/row in the csv and all records should have the
*same format. \n
*record_defaults: One tensor per column of the input record, with either a
*scalar default value for that column or an empty vector if the column is
*required. \n

*@par Attributes:
*OUT_TYPE: The numeric types to interpret each field of the record as. \n
*field_delim: char delimiter to separate fields in a record. \n
*use_quote_delim: If false, treats double quotation marks as regular characters
*inside of the string fields (ignoring RFC 4180, Section 2, Bullet 5). \n
*na_value: Additional string to recognize as NA/NaN. \n

*@par Outputs:
*output: Dynamic output tensors. Have the same types as "record_defaults". \n

*@attention Constraints:
*The implementation for DecodeCSV on Ascend uses AICPU, with bad
*performance. \n

*@par Third-party framework compatibility
*@li Compatible with the TensorFlow DecodeCSV operator.
*/
REG_OP(DecodeCSV)
.INPUT(records, TensorType({DT_STRING}))
.DYNAMIC_INPUT(record_defaults, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32,
DT_INT64, DT_STRING}))
.DYNAMIC_OUTPUT(output, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32,
DT_INT64, DT_STRING}))
.ATTR(OUT_TYPE, ListType, {})
.ATTR(field_delim, String, ",")
.ATTR(use_quote_delim, Bool, true)
.ATTR(na_value, String, ",")
.ATTR(select_cols, ListInt, {})
.OP_END_FACTORY_REG(DecodeCSV)

/**
*@brief Parses serialized Example prototypes into typed tensors.
*@par Inputs:
*serialized: A Tensor of type string. \n
*name: A Tensor of type string. \n
*sparse_keys: Dynamic input tensor of string. \n
*dense_keys: Dynamic input tensor of string \n
*dense_defaults: Dynamic input tensor type as string, float, int64. \n

*@par Attributes:
*Nsparse: Number of sparse_keys, sparse_indices and sparse_shapes \n
*Ndense: Number of dense_keys \n
*sparse_types: types of sparse_values \n
*Tdense: Type of dense_defaults dense_defaults and dense_values \n
*dense_shapes: output of dense_defaults shape \n

*@par Outputs:
*sparse_indices: A Tensor of type string. \n
*sparse_values: Has the same type as sparse_types. \n
*sparse_shapes: A Tensor of type int64 \n
*dense_values: Has the same type as dense_defaults. \n
*@par Third-party framework compatibility \n
*@li Compatible with the TensorFlow ParseExample operator. \n
*/
REG_OP(ParseExample)
.INPUT(serialized, TensorType({DT_STRING}))
.INPUT(name, TensorType({DT_STRING}))
.DYNAMIC_INPUT(sparse_keys, TensorType({DT_STRING}))
.DYNAMIC_INPUT(dense_keys, TensorType({DT_STRING}))
.DYNAMIC_INPUT(dense_defaults, TensorType({DT_FLOAT, DT_INT64, DT_STRING}))
.DYNAMIC_OUTPUT(sparse_indices, TensorType({DT_INT64}))
.DYNAMIC_OUTPUT(sparse_values, TensorType({DT_FLOAT, DT_INT64, DT_STRING}))
.DYNAMIC_OUTPUT(sparse_shapes, TensorType({DT_INT64}))
.DYNAMIC_OUTPUT(dense_values, TensorType({DT_FLOAT, DT_INT64, DT_STRING}))
.ATTR(Nsparse, Int, 0)
.ATTR(Ndense, Int, 0)
.ATTR(sparse_types, ListType, {})
.ATTR(Tdense, ListType, {})
.ATTR(dense_shapes, ListListInt, {})
.OP_END_FACTORY_REG(ParseExample)

/**
*@brief Transforms a scalar brain.SequenceExample proto (as strings) into typed
*tensors.
*@par Input:
*serialized: A Tensor of type string. \n
*feature_list_dense_missing_assumed_empty: A Tensor of type string. \n
*context_sparse_keys: Dynamic input tensor of string. \n
*context_dense_keys: Dynamic input tensor of string \n
*feature_list_sparse_keys: Dynamic input tensor of string \n
*feature_list_dense_keys: Dynamic input tensor of string \n
*context_dense_defaults: Dynamic input tensor of string, float, int64 \n
*debug_name: A Tensor of type string. \n

*@par Attributes:
*Ncontext_sparse: Number of context_sparse_keys, context_sparse_indices and context_sparse_shapes \n
*Ncontext_dense: Number of context_dense_keys \n
*Nfeature_list_sparse: Number of feature_list_sparse_keys \n
*Nfeature_list_dense: Number of feature_list_dense_keys \n
*context_sparse_types: Types of context_sparse_values \n
*Tcontext_dense: Types of context_dense_values \n
*feature_list_dense_types: Types of feature_list_dense_values \n
*context_dense_shapes: Shape of context_dense \n
*feature_list_sparse_types: Type of feature_list_sparse_values \n
*feature_list_dense_shapes: Shape of feature_list_dense \n

*@par Outputs:
*context_sparse_indices: Dynamic output tensor of type int64. \n
*context_sparse_values: Dynamic output tensor of type string, float, int64. \n
*context_sparse_shapes: Dynamic output tensor of type int64 \n
*context_dense_values: Dynamic output tensor of type string, float, int64. \n
*feature_list_sparse_indices: Dynamic output tensor of type int64. \n
*feature_list_sparse_values: Dynamic output tensor of type string, float, int64. \n
*feature_list_sparse_shapes: Dynamic output tensor of type int64 \n
*feature_list_dense_values: Dynamic output tensor of type string, float, int64. \n
*@par Third-party framework compatibility \n
*@li Compatible with the TensorFlow ParseSingleSequenceExample operator. \n
*/
REG_OP(ParseSingleSequenceExample)
.INPUT(serialized, TensorType({DT_STRING}))
.INPUT(feature_list_dense_missing_assumed_empty, TensorType({DT_STRING}))
.DYNAMIC_INPUT(context_sparse_keys, TensorType({DT_STRING}))
.DYNAMIC_INPUT(context_dense_keys, TensorType({DT_STRING}))
.DYNAMIC_INPUT(feature_list_sparse_keys, TensorType({DT_STRING}))
.DYNAMIC_INPUT(feature_list_dense_keys, TensorType({DT_STRING}))
.DYNAMIC_INPUT(context_dense_defaults, TensorType({DT_FLOAT, DT_INT64, DT_STRING}))
.INPUT(debug_name, TensorType({DT_STRING}))
.DYNAMIC_OUTPUT(context_sparse_indices, TensorType({DT_INT64}))
.DYNAMIC_OUTPUT(context_sparse_values, TensorType({DT_FLOAT, DT_INT64, DT_STRING}))
.DYNAMIC_OUTPUT(context_sparse_shapes, TensorType({DT_INT64}))
.DYNAMIC_OUTPUT(context_dense_values, TensorType({DT_FLOAT, DT_INT64, DT_STRING}))
.DYNAMIC_OUTPUT(feature_list_sparse_indices, TensorType({DT_INT64}))
.DYNAMIC_OUTPUT(feature_list_sparse_values, TensorType({DT_FLOAT, DT_INT64, DT_STRING}))
.DYNAMIC_OUTPUT(feature_list_sparse_shapes, TensorType({DT_INT64}))
.DYNAMIC_OUTPUT(feature_list_dense_values, TensorType({DT_FLOAT, DT_INT64, DT_STRING}))
.ATTR(Ncontext_sparse, Int, 0)
.ATTR(Ncontext_dense, Int, 0)
.ATTR(Nfeature_list_sparse, Int, 0)
.ATTR(Nfeature_list_dense, Int, 0)
.ATTR(context_sparse_types, ListType, {})
.ATTR(Tcontext_dense, ListType, {})
.ATTR(feature_list_dense_types, ListType, {})
.ATTR(context_dense_shapes, ListListInt, {})
.ATTR(feature_list_sparse_types, ListType, {})
.ATTR(feature_list_dense_shapes, ListListInt, {})
.OP_END_FACTORY_REG(ParseSingleSequenceExample)

} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_PARSING_OPS_H_

+ 25
- 6
third_party/fwkacllib/inc/ops/quantize_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -60,6 +60,26 @@ REG_OP(Dequantize)
.ATTR(mode, String, "MIN_COMBINED")
.OP_END_FACTORY_REG(Dequantize)

/**
*@brief Quantizes the input . \n
*@par Inputs:
*x: The input tensor to be quantized. \n
*scales: A tensor of quantization scales. \n
*zero_points: A tensor of quantization zero points. \n
*@par Attributes:
*@li dtype: A required string, specifying the output quantized type. \n
*@li axis: The axis along which quantization is applied. \n
*@par Outputs:
*y: The quantized output tensor. Has the same shape as "x"; its dtype is the quantized type specified by "dtype". \n
*/
REG_OP(Quantize)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(scales, TensorType({DT_FLOAT}))
.INPUT(zero_points, TensorType({DT_INT8,DT_UINT8,DT_INT32}))
.OUTPUT(y, TensorType({DT_INT8,DT_UINT8,DT_INT32}))
.REQUIRED_ATTR(dtype, String)
.ATTR(axis, Int, 1)
.OP_END_FACTORY_REG(Quantize)
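
The comment does not pin down the exact formula, so the sketch below assumes the usual affine quantization with round-to-nearest and saturation, here for an int8 output on one element of the quantized axis:

#include <algorithm>
#include <cmath>
#include <cstdint>

// q = saturate(round(x / scale) + zero_point)
int8_t QuantizeElemInt8(float x, float scale, int32_t zero_point) {
  const int32_t q =
      static_cast<int32_t>(std::nearbyint(x / scale)) + zero_point;
  return static_cast<int8_t>(std::min(127, std::max(-128, q)));
}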

/**
*@brief Quantizes the input . \n

@@ -194,7 +214,7 @@ REG_OP(AscendRequant)
*@brief Requantizes the input of int16 . \n

*@par Inputs:
*@li x: An NC1HWC0 tensor of type int16, specifying the input.
*@li x0: An NC1HWC0 tensor of type int16, specifying the input.
*@li req_scale: An NC1HWC0 tensor of type uint64, specifying the scaling ratio.
*@li x1: An NC1HWC0 tensor of type int16 . \n

@@ -203,22 +223,21 @@ REG_OP(AscendRequant)
*@li relu_flag: A optional bool, specifying whether to perform ReLU, either "True" or "False". Defaults to "False" . \n

*@par Outputs:
*@li y: The dequantized output tensor of type int8 and with format NC1HWC0.
*@li y0: The dequantized output tensor of type int8 and with format NC1HWC0.
*@li y1: The dequantized output tensor of type int16 and with format NC1HWC0 . \n

*@par Third-party framework compatibility
* It is a custom operator. It has no corresponding operator in Caffe.
*/
REG_OP(AscendRequantS16)
.INPUT(x, TensorType({DT_INT16}))
.INPUT(x0, TensorType({DT_INT16}))
.INPUT(req_scale, TensorType({DT_UINT64}))
.OPTIONAL_INPUT(x1, TensorType({DT_INT16}))
.OUTPUT(y, TensorType({DT_INT8}))
.OUTPUT(y0, TensorType({DT_INT8}))
.OUTPUT(y1, TensorType({DT_INT16}))
.ATTR(dual_output, Bool, false)
.ATTR(relu_flag, Bool, false)
.OP_END_FACTORY_REG(AscendRequantS16)

} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_QUANTIZE_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/ragged_array_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/ragged_conversion_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/ragged_math_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 90
- 1
third_party/fwkacllib/inc/ops/random_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -356,6 +356,39 @@ REG_OP(DropOutGenMask)
.ATTR(seed2, Int, 0)
.OP_END_FACTORY_REG(DropOutGenMask)


/**
*@brief Generate random uint8 mask for dropout v3 . \n

*@par Inputs:
*Two inputs, including:
*@li shape: The shape of the output tensor.
*@li prob: A 0-D tensor. Probability of 1. \n

*@par Attributes:
*@li seed:If either seed or seed2 are set to be non-zero, the random number
*generator is seeded by the given seed. Otherwise, it is seeded by a random seed.
*@li seed2:A second seed to avoid seed collision . \n

*@par Outputs:
*y:Output (1-D) random number using uint8 data format . \n

*@attention Constraints:
*The output size is aligned to 16.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.

*@see DropOutGenMaskV3()
*/
REG_OP(DropOutGenMaskV3)
.INPUT(shape, TensorType({ DT_INT32, DT_INT64 }))
.INPUT(prob, TensorType({ DT_FLOAT16, DT_FLOAT }))
.OUTPUT(y, TensorType({ DT_UINT8 }))
.ATTR(seed, Int, 0)
.ATTR(seed2, Int, 0)
.OP_END_FACTORY_REG(DropOutGenMaskV3)

/**
*@brief Generates values in an interval . \n

@@ -495,6 +528,62 @@ REG_OP(ShuffleChannel)
DT_UINT16, DT_INT32, DT_UINT32,DT_INT64,DT_UINT64}))
.ATTR(group, Int, 1)
.OP_END_FACTORY_REG(ShuffleChannel)

/**
* @brief Generates a tensor of samples from a multinomial
* distribution according to the probabilities of each of
* the possible outcomes.
*
* @par Inputs:
* One input, including:
* @li x: Input tensor with shape [batch_size, class_size],
* where class_size is the number of all possible outcomes.
* Each value along the axis zero represents the unnormalized
* log-probability of each corresponding outcome in a batch.
*
* @par Outputs:
* One output, including:
* @li y: Output tensor with shape [batch_size, sample_size],
* where sample_size is the number of times to sample.
* Each value along the axis zero represents the outcome of
* the corresponding sample in a batch.
*
* @par Restrictions:
* Warning:THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(MultinomialFuss)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64}))
.OUTPUT(y, TensorType({DT_INT32, DT_INT64}))
.ATTR(dtype, Int, 6)
.ATTR(sample_size, Int, 1)
.ATTR(seed, Float, 0)
.OP_END_FACTORY_REG(MultinomialFuss)

/**
* @brief During training, randomly zeroes some of the elements of the input tensor
* with probability "p".
*
* @par Inputs:
* @li x: An ND Tensor. Must be one of the following data types: Float, Float16
* @li seed: An ND Tensor. Must be one of the following data types: Float
*
* @par Attributes:
* @li p: The probability of an element to be zeroed.
*
* @par Outputs:
* @li y: A tensor with the same shape and type as "x".
* @li mask: A tensor with the same shape as "x", of type float.
* @li new_seed: A tensor with the same shape and type as "seed".
*/

REG_OP(DropoutV2)
.INPUT(x, TensorType({ DT_FLOAT16, DT_FLOAT }))
.INPUT(seed, TensorType({ DT_FLOAT }))
.OUTPUT(y, TensorType({ DT_FLOAT16, DT_FLOAT }))
.OUTPUT(mask, TensorType({ DT_FLOAT }))
.OUTPUT(seed, TensorType({ DT_FLOAT }))
.REQUIRED_ATTR(p, Float)
.OP_END_FACTORY_REG(DropoutV2)
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_RANDOM_OPS_H_

+ 261
- 18
third_party/fwkacllib/inc/ops/reduce_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@ namespace ge {
*@attention Constraints:
* This operator is a BatchNorm fusion operator for updating the moving
* averages for training.
* This operator is used in conjunction with BNTrainingUpdate.
* This operator is used in conjunction with BNTrainingReduce.
*/
REG_OP(BNTrainingReduce)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
@@ -45,6 +45,27 @@ REG_OP(BNTrainingReduce)
.OUTPUT(square_sum, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(BNTrainingReduce)

/**
*@brief Performs reduced batch normalization . \n

*@par Inputs:
*x: A 6D Tensor of type float16 or float32, with format NDC1HWC0 . \n

*@par Outputs:
*@li sum: A 3D Tensor of type float32 for SUM reduced "x".
*@li square_sum: A 3D Tensor of type float32 for SUMSQ reduced "x" . \n

*@attention Constraints:
* This operator is a BatchNorm fusion operator for updating the moving
* averages for training.
* This operator is used in conjunction with BN3DTrainingUpdate.
*/
REG_OP(BN3DTrainingReduce)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(sum, TensorType({DT_FLOAT}))
.OUTPUT(square_sum, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(BN3DTrainingReduce)

/**
*@brief Performs the backpropagation of BatchNorm . \n

@@ -88,6 +109,49 @@ REG_OP(BNTrainingReduceGrad)
.ATTR(epsilon, Float, 0.0001)
.OP_END_FACTORY_REG(BNTrainingReduceGrad)

/**
*@brief Performs the backpropagation of BatchNorm . \n

*@par Inputs:
* Seven inputs, including:
*@li grads: A 6D Tensor of type float16 or float32, with format NDC1HWC0, for
* the gradient.
*@li x: A 6D Tensor of type float16 or float32, with format NDC1HWC0.
*@li diff_scale: A 6D Tensor of type float32, with format NDC1HWC0,
* for the mean of "x".
*@li diff_offset: A 6D Tensor of type float32, with format NDC1HWC0,
* for the variance of "x".
*@li scale: A 6D Tensor of type float32, with format NDC1HWC0.
*@li batch_mean: A 6D Tensor of type float32, with format NDC1HWC0,
* for the mean of "x".
*@li batch_variance: A 6D Tensor of type float32, with format NDC1HWC0,
* for the variance of "x" . \n

*@par Attributes:
*epsilon: An optional float32. Defaults to "0.0001". A small float number
* added to the variance of "x" . \n

*@par Outputs:
*y: A Tensor of type float16 or float32, with format NDC1HWC0, for the offset
* of "x" . \n

*@attention Constraints:
* The preceding layer of this operator must be BN3DTrainingUpdateGrad . \n

*@see BN3DTrainingUpdateGrad
*/
REG_OP(BN3DTrainingReduceGrad)
.INPUT(grads, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(diff_scale, TensorType({DT_FLOAT}))
.INPUT(diff_offset, TensorType({DT_FLOAT}))
.INPUT(scale, TensorType({DT_FLOAT}))
.INPUT(batch_mean, TensorType({DT_FLOAT}))
.INPUT(batch_variance, TensorType({DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.ATTR(epsilon, Float, 0.0001)
.OP_END_FACTORY_REG(BN3DTrainingReduceGrad)

/**
*@brief Performs reduced batch normalization . \n

@@ -120,7 +184,7 @@ REG_OP(BNTrainingReduceGrad)
*@attention Constraints:
*@li This operator is a BatchNorm fusion operator for updating the moving
averages for training.
*This operator is used in conjunction with BNTrainingReduce.
*This operator is used in conjunction with BNTrainingUpdate.
*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square
* root instruction.
*/
@@ -141,6 +205,59 @@ REG_OP(BNTrainingUpdate)
.OUTPUT(batch_variance, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(BNTrainingUpdate)

/**
*@brief Performs reduced batch normalization . \n

*@par Inputs:
* Seven inputs, including: (NDC1HWC0 supported)
*@li x: A 6D Tensor of type float16 or float32.
*@li sum: A 6D Tensor of type float32 for the output of operator
* BN3DTrainingReduce.
*@li square_sum: A 6D Tensor of type float32 for the output of operator
* BN3DTrainingReduce.
*@li scale: A 6D Tensor of type float32, for the scaling factor.
*@li offset: A 6D Tensor of type float32, for the scaling offset.
*@li mean: A 6D Tensor of type float32, for the updated mean.
*@li variance: A 6D Tensor of type float32, for the updated variance . \n

*@par Attributes:
*@li epsilon: A required float32, specifying the small value added to variance
* to avoid dividing by zero.
*@li factor: A required float32, specifying the weight for updating the mean
* and variance . \n

*@par Outputs:
* Five outputs, including: (NDC1HWC0 supported)
*@li y: A 6D Tensor of type float16 or float32, for normalized "x".
*@li mean: A 6D Tensor of type float32, for the updated mean.
*@li variance: A 6D Tensor of type float32, for the updated variance.
*@li batch_mean: A 6D Tensor of type float32, for the mean of "x".
*@li batch_variance: A 6D Tensor of type float32, for the variance of "x" . \n

*@attention Constraints:
*@li This operator is a BatchNorm fusion operator for updating the moving
averages for training.
*This operator is used in conjunction with BN3DTrainingReduce.
*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square
* root instruction.
*/
REG_OP(BN3DTrainingUpdate)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(sum, TensorType({DT_FLOAT}))
.INPUT(square_sum, TensorType({DT_FLOAT}))
.INPUT(scale, TensorType({DT_FLOAT}))
.INPUT(offset, TensorType({DT_FLOAT}))
.INPUT(mean, TensorType({DT_FLOAT}))
.INPUT(variance, TensorType({DT_FLOAT}))
.REQUIRED_ATTR(factor, Float)
.REQUIRED_ATTR(epsilon, Float)
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT}))
.OUTPUT(mean, TensorType({DT_FLOAT}))
.OUTPUT(variance, TensorType({DT_FLOAT}))
.OUTPUT(batch_mean, TensorType({DT_FLOAT}))
.OUTPUT(batch_variance, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(BN3DTrainingUpdate)

/**
*@brief Performs batch normalization for inference . \n

@@ -284,6 +401,40 @@ REG_OP(BNTrainingUpdateGrad)
.OUTPUT(diff_offset, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(BNTrainingUpdateGrad)

/**
*@brief Performs the backpropagation of BatchNorm . \n

*@par Inputs:
* Four inputs, including:
*@li grads: A 6D Tensor of type float16 or float32, with format NDC1HWC0,
* for the gradient.
*@li x: A 6D Tensor of type float16 or float32, with format NDC1HWC0.
*@li batch_mean: A 6D Tensor of type float32, with format NDC1HWC0,
* for the mean of "x".
*@li batch_variance: A 6D Tensor of type float32, with format NDC1HWC0,
* for the variance of "x" . \n

*@par Attributes:
*epsilon: An optional float32. Defaults to "0.0001". A small float number
* added to the variance of "x" . \n

*@par Outputs:
*@li diff_scale: A Tensor of type float32, with format NDC1HWC0,
* for the offset of "scale".
*@li diff_offset: A Tensor of type float32, with format NDC1HWC0,
* for the offset of "offset" . \n

*/
REG_OP(BN3DTrainingUpdateGrad)
.INPUT(grads, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(batch_mean, TensorType({DT_FLOAT}))
.INPUT(batch_variance, TensorType({DT_FLOAT}))
.ATTR(epsilon, Float, 0.0001)
.OUTPUT(diff_scale, TensorType({DT_FLOAT}))
.OUTPUT(diff_offset, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(BN3DTrainingUpdateGrad)

/**
*@brief Performs the backpropagation of BatchNorm for inference . \n

@@ -635,8 +786,8 @@ REG_OP(ReduceMin)
* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceMin instead.
*/
REG_OP(ReduceMinD)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8}))
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8,DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8,DT_INT32}))
.REQUIRED_ATTR(axes, ListInt)
.ATTR(keep_dims, Bool, false)
.OP_END_FACTORY_REG(ReduceMinD)
@@ -747,14 +898,14 @@ REG_OP(Reduction)
*@brief Computes the euclidean norm of elements across dimensions of a tensor . \n

*@par Inputs:
*@li input_tensor: A Tensor. Must be one of the following types: float16, float32, int32.
*@li x: A Tensor. Must be one of the following types: float16, float32, int32.
*@li axes: A Tensor of type int8 or int32. Specifies the dimensions to reduce. Defaults to "None" . \n

*@par Attributes:
*keep_dims: An optional bool. If "True", reduced dimensions will be retained. Defaults to "False" . \n

*@par Outputs:
*output_tensor: A Tensor. Must be one of the following types: float16, float32, int32 . \n
*y: A Tensor. Must be one of the following types: float16, float32, int32 . \n

*@attention Constraints:
* If "axes = None", all dimensions will be reduced. "axes" must be in the range [-rank(input_shape), rank(input_shape)) . \n
@@ -821,7 +972,7 @@ Defaults to "0.00001" . \n
*batch_variance: A Tensor of type float32 for the result variance . \n

*@attention Constraints:
*For Ascend 310, the result accuracy fails to reach 1 due to the square root instruction.
*For Ascend 310, the result accuracy fails to reach 0.001 due to the square root instruction.
*/
REG_OP(INInferV2)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
@@ -839,7 +990,7 @@ REG_OP(INInferV2)
*@brief Performs reduced instance normalization . \n

*@par Inputs:
*x: A Tensor of type float16 or float32, with format NC1HWC0 . \n
*x: A Tensor of type float16 or float32. \n

*@par Outputs:
*@li sum: A Tensor of type float32 for SUM reduced "x".
@@ -862,19 +1013,19 @@ REG_OP(INTrainingReduceV2)
*@par Inputs:
* Seven inputs, including: (NC1HWC0supported)
*@li x: A Tensor of type float16 or float32.
*@li sum: A T [N, C1, 1, 1, C0] ensor of type float32 for the output of operator INTrainingReduceV2.
*@li square_sum: A [N, C1, 1, 1, C0] Tensor of type float32 for the output of operator INTrainingReduceV2.
*@li gamma: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling gamma.
*@li beta: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling beta.
*@li mean: A [N, C1, 1, 1, C0] Tensor of type float32, for the updated mean.
*@li variance: A [N, C1, 1, 1, C0] Tensor of type float32, for the updated variance . \n
*@li sum: A Tensor of type float32 for the output of operator INTrainingReduceV2.
*@li square_sum: A Tensor of type float32 for the output of operator INTrainingReduceV2.
*@li gamma: A Tensor of type float32, for the scaling gamma.
*@li beta: A Tensor of type float32, for the scaling beta.
*@li mean: A Tensor of type float32, for the updated mean.
*@li variance: A Tensor of type float32, for the updated variance . \n

*@par Attributes:
*@li momentum: A required float32, specifying the momentum to update mean and var.
*@li epsilon: A required float32, specifying the small value added to variance to avoid dividing by zero . \n

*@par Outputs:
* Three outputs, including: (NC1HWC0 supported)
* Three outputs
*@li y: A Tensor of type float16 or float32, for normalized "x".
*@li batch_mean: A Tensor of type float32, for the updated mean.
*@li batch_variance: A Tensor of type float32, for the updated variance . \n
@@ -882,7 +1033,7 @@ REG_OP(INTrainingReduceV2)
*@attention Constraints:
*@li This operator is a InstanceNorm fusion operator for updating the moving averages for training.
* This operator is used in conjunction with INTrainingReduceV2.
*@li For Ascend 310, the result accuracy fails to reach 1 due to the square root instruction.
*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction.
*/
REG_OP(INTrainingUpdateV2)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
@@ -965,7 +1116,7 @@ for the updated variance.
*@attention Constraints:
*@li This operator is a InstanceNorm fusion operator for updating the moving averages for training.
* This operator is used in conjunction with GNTrainingUpdate.
*@li For Ascend 310, the result accuracy fails to reach 1 due to the square root instruction.
*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction.
*/
REG_OP(GNTrainingUpdate)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
@@ -982,6 +1133,98 @@ REG_OP(GNTrainingUpdate)
.OUTPUT(batch_variance, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(GNTrainingUpdate)

/**
*@brief Joins a string Tensor across the given dimensions. \n

*@par Inputs:
*Two inputs, including:
*@li input: A Tensor of type string. The text to be processed.
*@li reduction_indices: A Tensor of type int. The dimensions to reduce over.

*@par Attributes:
*@li keep_dims: An optional bool. Defaults to true. If true, retain reduced dimensions with length 1.
*@li separator: An optional string. Defaults to "".

*@par Outputs:
*@li output: A Tensor of type string.
*/
REG_OP(ReduceJoin)
.INPUT(input, TensorType({DT_STRING}))
.INPUT(reduction_indices, TensorType({DT_INT32}))
.OUTPUT(output, TensorType({DT_STRING}))
.ATTR(keep_dims, Bool, true)
.ATTR(separator, String, "")
.OP_END_FACTORY_REG(ReduceJoin)

/**
* @brief Calculates the standard deviation and average value of Tensors.

* @par Inputs:
* @li x: A Tensor. Must be one of the following types:
* float16, float32. \n

* @par Attributes:
* Three Attributes, including:
* @li dim: An optional listint, Defaults to "None". \n

* @li unbiased: An optional bool. Defaults to "True".
* If "True", Use Bessel Correction.
* If "False", Do not use Bessel Correction. \n

* @li keepdim: An optional bool. Defaults to "False".
* If "True", Keep the original tensor dimension.
* If "False", Do not keep the original tensor dimension. \n

* @par Outputs:
* Two Outputs, including:
* @li y1: A Tensor. Has the same type as "x".
* @li y2: A Tensor. Has the same type as "x". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator ReduceStd.
*/
REG_OP(ReduceStd)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y1, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y2, TensorType({DT_FLOAT, DT_FLOAT16}))
.ATTR(dim, ListInt, {})
.ATTR(unbiased, Bool, true)
.ATTR(keepdim, Bool, false)
.OP_END_FACTORY_REG(ReduceStd)
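
A hedged sketch of the reduction behind ReduceStd for one flattened slice, showing how the unbiased flag switches on Bessel's correction (assumes at least two elements when unbiased):

#include <cmath>
#include <vector>

float StdOfSlice(const std::vector<float> &v, bool unbiased) {
  const size_t n = v.size();
  float mean = 0.0f;
  for (float x : v) mean += x;
  mean /= static_cast<float>(n);
  float ss = 0.0f;
  for (float x : v) ss += (x - mean) * (x - mean);
  return std::sqrt(ss / static_cast<float>(unbiased ? n - 1 : n));
}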

/**
* @brief Calculates the standard deviation of Tensors.

* @par Inputs:
* Two inputs, including:
* @li x: A Tensor. Must be one of the following types: float16, float32. \n
* @li mean: A Tensor. It's the mean of "x". Must be one of the following types: float16, float32. \n


* @par Attributes:
* Three Attributes, including:
* @li dim: An optional listint, Defaults to "None". \n
* @li unbiased: An optional bool. Defaults to "True".
* If "True", Use Bessel Correction.
* If "False", Do not use Bessel Correction. \n
* @li keepdim: An optional bool. Defaults to "False".
* If "True", Keep the original tensor dimension.
* If "False", Do not keep the original tensor dimension. \n

* @par Outputs:
* @li y: A Tensor. It's the std of "x". Has the same type as "x".

* @par Third-party framework compatibility
* Compatible with the Pytorch operator ReduceStdWithMean.
*/
REG_OP(ReduceStdWithMean)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.ATTR(dim, ListInt, {})
.ATTR(unbiased, Bool, true)
.ATTR(keepdim, Bool, false)
.OP_END_FACTORY_REG(ReduceStdWithMean)
} //namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/resource_variable_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 566
- 29
third_party/fwkacllib/inc/ops/rnn.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -33,6 +33,7 @@ namespace ge {
*@li c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li w:A 4D Tensor. Must be one of the following types: float16. The format must be FRACTAL_Z.
*@li b:A 1D Tensor. Must be one of the following types: float16. The format must be ND . \n
*@li mask:A 1D Tensor. Must be one of the following types: uint8.

*@par Attributes:
*@li keep_prob:An integer identifying the keep prob in the op. Default to 1.
@@ -42,7 +43,6 @@ namespace ge {

*@par Outputs:
*seven outputs:
*@li mask:A 1D Tensor. Must be one of the following types: uint8.
*@li ct:A 4D Tensor. Must be one of the following types: float16, float32.
*@li ht:A 4D Tensor. Must be one of the following types: float16.
*@li it:A 4D Tensor. Must be one of the following types: float16, float32.
@@ -187,16 +187,16 @@ REG_OP(DynamicRNNGrad)
*@brief: DynamicRNN calculation.
*@par Inputs:
*ten inputs:
*@li x:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li w:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li b:A 1D Tensor. Must be one of the following types: float16, float32. The format must be ND.
*@li seq_length:A 1D Tensor. Must be one of the following types: int32. The format must be ND.
*@li init_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li init_c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li wci:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li wcf:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li wco:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li mask:A 1D Tensor. Must be one of the following types: uint8. The format must be ND . \n
*@li x:A required 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li w:A required 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li b:A required 1D Tensor. Must be one of the following types: float16, float32. The format must be ND.
*@li seq_length:An optional Tensor. Only float16 in FRACTAL_NZ and int32 in ND are supported.
*@li init_h:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li init_c:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li wci:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li wcf:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li wco:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li mask:An optional 1D Tensor. Must be one of the following types: uint8. The format must be ND . \n

*@par Attributes:
*@li cell_type:A string identifying the cell type in the op. Defaults to "LSTM". Only LSTM is currently supported.
@@ -209,6 +209,7 @@ REG_OP(DynamicRNNGrad)
*@li time_major:A bool identifying the time major in the op. Defaults to true.
*@li activation:A string identifying the type of activation function in the op. Defaults to "tanh". Only tanh is currently supported.
*@li forget_bias:A float identifying the forget bias in the op. Defaults to 0.
*@li gate_order:A string identifying the gate order in the op. Supports "ijfo" and "ifjo". Defaults to "ijfo".
*@li is_training:A bool identifying whether the op is in training mode. Defaults to true . \n

*@par Outputs:
@@ -221,12 +222,14 @@ REG_OP(DynamicRNNGrad)
*@li f:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li o:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li tanhct:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@par Third-party framework compatibility:
* Compatible with the TF operator LSTM.
*/
REG_OP(DynamicRNN)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(w, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(b, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(seq_length, TensorType({DT_INT32}))
.OPTIONAL_INPUT(seq_length, TensorType({DT_INT32, DT_FLOAT16}))
.OPTIONAL_INPUT(init_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(init_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wci, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -251,9 +254,237 @@ REG_OP(DynamicRNN)
.ATTR(time_major, Bool, true)
.ATTR(activation, String, "tanh")
.ATTR(forget_bias, Float, 0.0)
.ATTR(gate_order, String, "ijfo")
.ATTR(is_training, Bool, true)
.OP_END_FACTORY_REG(DynamicRNN)
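
The "gate_order" attribute only changes which hidden-sized slice of the fused gate dimension in "w"/"b" belongs to which gate. A small host-side sketch of that bookkeeping, with illustrative names that are not part of this API:

#include <cstdio>
#include <map>
#include <string>

// For a fused LSTM weight of shape [input_size + hidden_size, 4 * hidden_size],
// each gate owns one hidden-sized slice; gate_order decides which slice is which.
int main() {
  const int hidden_size = 16;
  const std::string gate_order = "ijfo";  // or "ifjo", per the attribute above
  std::map<char, int> col_offset;
  for (int k = 0; k < 4; ++k) col_offset[gate_order[k]] = k * hidden_size;
  std::printf("i@%d j@%d f@%d o@%d\n", col_offset['i'], col_offset['j'],
              col_offset['f'], col_offset['o']);
  return 0;
}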

/**
*@brief: DynamicRNNV2 calculation.
*@par Inputs:
*eleven inputs:
*@li x:A required 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li weight_input:A required 4D Tensor. Must be one of the following types: float16, float32.
*The format must be FRACTAL_Z.
*@li weight_hidden:A required 4D Tensor. Must be one of the following types: float16, float32.
*The format must be FRACTAL_Z.
*@li b:An optional 1D Tensor. Must be one of the following types: float16, float32. The format must be ND.
*@li seq_length:An optional 1D Tensor. Must be one of the following types: int32. The format must be ND.
*@li init_h:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li init_c:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li wci:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li wcf:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li wco:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li mask:An optional 1D Tensor. Must be one of the following types: uint8. The format must be ND . \n

*@par Attributes:
*@li cell_type:A string identifying the cell type in the op. Defaults to "LSTM". Only LSTM is currently supported.
*@li direction:A string identifying the direction in the op. Defaults to "UNIDIRECTIONAL".
*Only UNIDIRECTIONAL is currently supported.
*@li cell_depth:An integer identifying the cell depth in the op. Defaults to 1.
*@li use_peephole:A bool identifying whether to use peephole in the op. Defaults to false.
*@li keep_prob:A float identifying the keep prob in the op. Defaults to 1.
*@li cell_clip:A float identifying the cell clip in the op. Defaults to -1.
*@li num_proj:An integer identifying the num projection in the op. Defaults to 0.
*@li time_major:A bool identifying the time major in the op. Defaults to true.
*@li activation:A string identifying the type of activation function in the op. Defaults to "tanh".
*Only tanh is currently supported.
*@li recurrent_activation:A string identifying the type of recurrent activation function in the op. Defaults to "sigmoid".
*Supports "sigmoid" and "hard_sigmoid". In general, set "hard_sigmoid" for TF Keras LSTM.
*@li forget_bias:A float identifying the forget bias in the op. Defaults to 0.
*@li gate_order:A string identifying the gate order in the op. Supports "ijfo" and "ifco". Defaults to "ijfo".
*Set "ijfo" for the TF operator LSTM; set "ifco" for TF Keras LSTM.
*@li stateful: A bool identifying whether the op is stateful. Defaults to false. Only false is currently supported.
*@li merge_mode: A string identifying the merge mode in the op. Defaults to "concat".
*Only "concat" is currently supported.
*@li is_training:A bool identifying whether the op is in training mode. Defaults to true . \n

*@par Outputs:
*eight outputs:
*@li y:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li output_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*Return the last output_h.
*@li output_c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*Return the last output_c.
*@li i:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li j:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li f:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li o:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li tanhct:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@par Third-party framework compatibility:
* Compatible with the TF operator LSTM or TF keras operator LSTM.
*/

REG_OP(DynamicRNNV2)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(weight_input, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(weight_hidden, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(b, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(seq_length, TensorType({DT_INT32}))
.OPTIONAL_INPUT(init_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(init_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wci, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wcf, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wco, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(mask, TensorType({DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(i, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(j, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(f, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(o, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(tanhc, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(cell_type, String, "LSTM")
.ATTR(direction, String, "UNIDIRECTIONAL")
.ATTR(cell_depth, Int, 1)
.ATTR(use_peephole, Bool, false)
.ATTR(keep_prob, Float, 1.0)
.ATTR(cell_clip, Float, -1.0)
.ATTR(num_proj, Int, 0)
.ATTR(time_major, Bool, true)
.ATTR(activation, String, "tanh")
.ATTR(recurrent_activation, String, "sigmoid")
.ATTR(forget_bias, Float, 0.0)
.ATTR(gate_order, String, "ijfo")
.ATTR(stateful, Bool, false)
.ATTR(merge_mode, String, "concat")
.ATTR(is_training, Bool, true)
.OP_END_FACTORY_REG(DynamicRNNV2)
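
The "hard_sigmoid" choice for "recurrent_activation" is the piecewise-linear approximation used by TF Keras; a standalone sketch, assuming the common 0.2/0.5 slope and offset:

#include <algorithm>
#include <cstdio>

// hard_sigmoid(x) = clip(0.2 * x + 0.5, 0, 1): cheaper than sigmoid, and the
// default recurrent activation of TF Keras LSTM.
static float HardSigmoid(float x) {
  return std::min(1.0f, std::max(0.0f, 0.2f * x + 0.5f));
}

int main() {
  for (float x : {-3.0f, 0.0f, 1.0f, 3.0f})
    std::printf("hard_sigmoid(%g) = %g\n", x, HardSigmoid(x));
  return 0;
}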

/**
*@brief: DynamicRNNV3 calculation.
*@par Inputs:
*twelve inputs:
*@li x:A required 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li w:A required 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li b:A required 1D Tensor. Must be one of the following types: float16, float32. The format must be ND.
*@li seq_length:An optional 1D Tensor. Must be one of the following types: int32. The format must be ND.
*@li init_h:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li init_c:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li wci:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li wcf:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li wco:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li mask:An optional 1D Tensor. Must be one of the following types: uint8. The format must be ND.
*@li real_mask:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li project:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ. \n

*@par Attributes:
*@li cell_type:A string identifying the cell type in the op. Defaults to "LSTM". Only LSTM is currently supported.
*@li direction:A string identifying the direction in the op. Defaults to "UNIDIRECTIONAL". Only UNIDIRECTIONAL is currently supported.
*@li cell_depth:An integer identifying the cell depth in the op. Defaults to 1.
*@li use_peephole:A bool identifying whether to use peephole in the op. Defaults to false.
*@li keep_prob:A float identifying the keep prob in the op. Defaults to 1.
*@li cell_clip:A float identifying the cell clip in the op. Defaults to -1.
*@li num_proj:An integer identifying the num projection in the op. Defaults to 0.
*@li time_major:A bool identifying the time major in the op. Defaults to true.
*@li activation:A string identifying the type of activation function in the op. Defaults to "tanh". Only tanh is currently supported.
*@li forget_bias:A float identifying the forget bias in the op. Defaults to 0.
*@li is_training:A bool identifying whether the op is in training mode. Defaults to true . \n

*@par Outputs:
*eight outputs:
*@li y:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li output_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li output_c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li i:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li j:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li f:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li o:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li tanhct:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@par Third-party framework compatibility:
* Compatible with the TF operator LSTM.
*/
REG_OP(DynamicRNNV3)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(w, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(b, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(seq_length, TensorType({DT_INT32}))
.OPTIONAL_INPUT(init_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(init_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wci, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wcf, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wco, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(mask, TensorType({DT_UINT8}))
.OPTIONAL_INPUT(real_mask, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(project, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(i, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(j, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(f, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(o, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(tanhc, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(cell_type, String, "LSTM")
.ATTR(direction, String, "UNIDIRECTIONAL")
.ATTR(cell_depth, Int, 1)
.ATTR(use_peephole, Bool, false)
.ATTR(keep_prob, Float, 1.0)
.ATTR(cell_clip, Float, -1.0)
.ATTR(num_proj, Int, 0)
.ATTR(time_major, Bool, true)
.ATTR(activation, String, "tanh")
.ATTR(forget_bias, Float, 0.0)
.ATTR(is_training, Bool, true)
.OP_END_FACTORY_REG(DynamicRNNV3)

/**
*@brief: DynamicLSTMV2 calculation.
*@par Inputs:
*eleven inputs:
*@li x:A required 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li w:A required 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li b:A required 1D Tensor. Must be one of the following types: float16, float32. The format must be ND.
*@li cont:A required 2D Tensor. Must be one of the following types: float16, float32. The format must be ND.
*@li w_xc_x_static:An optional 2D Tensor. Must be one of the following types: float16, float32. The format must be ND.
*@li h0:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li c0:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li wci:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li wcf:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li wco:An optional 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li mask:An optional 1D Tensor. Must be one of the following types: uint8. The format must be ND.

*@par Attributes:
*@li num_output:An integer identifying the number of output neurons in the op. Defaults to 0.
*@li expose_hidden:A bool identifying whether to expose the hidden state in the op. Defaults to false.
*@li need_output_last:A bool identifying whether to output only the last time step. Defaults to false.
*@li forget_bias:A float identifying the forget bias in the op. Defaults to 0.

*@par Outputs:
*five outputs:
*@li y:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li output_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li output_c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li last_output_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li last_output_c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@par Third-party framework compatibility:
* Compatible with the Caffe operator LSTM.
*@par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DynamicLSTMV2)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(w, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(b, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(cont, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(w_xc_x_static, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(h0, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(c0, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wci, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wcf, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(wco, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(mask, TensorType({DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(last_output_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(last_output_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(num_output, Int, 0)
.ATTR(expose_hidden, Bool, false)
.ATTR(need_output_last, Bool, false)
.ATTR(forget_bias, Float, 0.0)
.OP_END_FACTORY_REG(DynamicLSTMV2)
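
Under the Caffe LSTM convention referenced above, "cont" is a per-step continuation flag: a zero marks a sequence start and resets the recurrent state. A schematic host-side sketch of that gating (the LSTM step itself is elided):

#include <cstdio>
#include <vector>

// cont[t] == 0 marks a sequence start: the incoming hidden and cell state
// are multiplied by cont[t], i.e. reset to zero before step t runs.
int main() {
  const std::vector<float> cont{0, 1, 1, 0, 1};  // two sequences in one lane
  float h = 0.9f, c = 0.4f;
  for (size_t t = 0; t < cont.size(); ++t) {
    h *= cont[t];
    c *= cont[t];
    std::printf("t=%zu h_in=%g c_in=%g\n", t, h, c);
    // ... a full implementation would run one LSTM step here, updating h and c ...
  }
  return 0;
}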

/**
*@brief: LSTMInputGrad calculation.
*@par Inputs:
@@ -297,6 +528,60 @@ REG_OP(LSTMInputGrad)
.OP_END_FACTORY_REG(LSTMInputGrad)



/**
*@brief Dynamic LSTM cell grad calculation. Calculates the gradients of the gates and the cell state.
*@par Inputs:
*twelve inputs:
*@li init_c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li c:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li dy:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li dh:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li dc:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li i:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li j:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li f:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li o:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li tanhct:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li mask:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li t_state:A Tensor of type int32. The format must be ND. \n

*@par Attributes:
*@li forget_bias:A float identifying the forget bias in the op. Defaults to 1.
*@li activation:A string identifying the type of activation function in the op. Only tanh is currently supported . \n
*@li direction:A string that marks the calculation sequence of the operator. Defaults to "Forward".
*@li gate_order:A string marking the order of the four output gates. Defaults to "ijfo".

*@par Outputs:
*two outputs:
*@li dgate:A 4D Tensor. Must be one of the following types: float16.
*@li dct_1:A 4D Tensor. Must be one of the following types: float16, float32.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DynamicLSTMGradCell)
.INPUT(init_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(c, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(dy, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(dh, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(dc, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(i, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(j, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(f, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(o, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(tanhct, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(mask, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(t_state, TensorType({DT_INT32}))
.OUTPUT(dgate, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(dct_1, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(forget_bias, Float, 1)
.ATTR(activation, String, "")
.ATTR(direction, String, "Forward")
.ATTR(gate_order, String, "ijfo")
.OP_END_FACTORY_REG(DynamicLSTMGradCell)
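
For orientation, the textbook single-step LSTM cell gradients that an op of this shape computes; a scalar host-side sketch of the math, not the kernel:

#include <cstdio>

// Scalar single-step LSTM backward: with post-activation gates i, f, o, the
// tanh candidate j, tanh(c_t), and upstream dh/dc, recover the gate
// pre-activation gradients (the "dgate" output) and dct_1 = dL/dc_{t-1}.
int main() {
  const float i = 0.6f, j = 0.1f, f = 0.7f, o = 0.5f;
  const float tanh_ct = 0.3f, c_prev = 0.2f, dh = 1.0f, dc = 0.1f;
  const float dct = dc + dh * o * (1 - tanh_ct * tanh_ct);
  const float di = dct * j * i * (1 - i);       // d(sigmoid) = s * (1 - s)
  const float dj = dct * i * (1 - j * j);       // d(tanh)    = 1 - t * t
  const float df = dct * c_prev * f * (1 - f);
  const float dout = dh * tanh_ct * o * (1 - o);
  const float dct_1 = dct * f;
  std::printf("di=%g dj=%g df=%g do=%g dct_1=%g\n", di, dj, df, dout, dct_1);
  return 0;
}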


/**
*@brief: Basic LSTM Cell backward calculation.Calculate the gradient of input and hidden state.
*@par Inputs:
@@ -475,9 +760,9 @@ REG_OP(BasicRNNCell)
.OP_END_FACTORY_REG(BasicRNNCell)

/**
*@brief: DynamicGRU calculation.
*@brief DynamicGRU calculation.
*@par Inputs:
*seven inputs: \n
*seven inputs:
*@li x:Must be one of the following types: float16. The format must be FRACTAL_NZ.
*@li w:Must be one of the following types: float16. The format must be FRACTAL_Z.
*@li b:Must be one of the following types: float16, float32. The format must be ND.
@@ -497,7 +782,7 @@ REG_OP(BasicRNNCell)
*@li is_training:An bool identifying is training in the op. Default to true.

*@par Outputs:
*five outputs: \n
*five outputs:
*@li y:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li output_h:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li r:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
@@ -531,9 +816,9 @@ REG_OP(DynamicGRU)
.OP_END_FACTORY_REG(DynamicGRU)

/**
*@brief: DynamicGRUV2 calculation.
*@brief DynamicGRUV2 calculation.
*@par Inputs:
*seven inputs: \n
*seven inputs:
*@li x:Must be one of the following types: float16. The format must be FRACTAL_NZ.
*@li weight_input:Must be one of the following types: float16. The format must be FRACTAL_Z.
*@li weight_hidden:Must be one of the following types: float16. The format must be FRACTAL_Z.
@@ -555,16 +840,13 @@ REG_OP(DynamicGRU)
*@li is_training:An bool identifying is training in the op. Default to true.

*@par Outputs:
*six outputs: \n
*six outputs:
*@li y:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li output_h:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li update:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li reset:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li new:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li hidden_new:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DynamicGRUV2)
.INPUT(x, TensorType({DT_FLOAT16}))
@@ -592,6 +874,68 @@ REG_OP(DynamicGRUV2)
.ATTR(is_training, Bool, true)
.OP_END_FACTORY_REG(DynamicGRUV2)


/**
*@brief DynamicGRUV2Hidden calculation.
*@par Inputs:
*five inputs:
*@li x_weight_input:Must be one of the following types: float32. The format must be FRACTAL_NZ.
*@li weight_hidden:Must be one of the following types: float16. The format must be FRACTAL_Z.
*@li bias_hidden:Must be one of the following types: float16, float32. The format must be ND.
*@li seq_length:Must be one of the following types: int32. The format must be ND.
*@li init_h:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.

*@par Attributes:
*@li direction:An string identifying the direction in the op. Default to "UNIDIRECTIONAL".
Only UNIDIRECTIONAL is currently supported.
*@li cell_depth:An integer identifying the cell depth in the op. Default to 1.
*@li keep_prob:An float identifying the keep prob in the op. Default to 1.
*@li cell_clip:An float identifying the cell clip in the op. Default to -1.
*@li num_proj:An integer identifying the num projection in the op. Default to 0.
*@li time_major:An bool identifying the time major in the op. Default to true.
*@li activation:An string identifying the type of activation function in the op. Default to "tanh".
Only tanh is currently supported.
*@li gate_order:An string identifying the gate order in weight and bias. Default to "zrh". "rzh" is another option.
*@li reset_after:An bool identifying whether to apply reset gate after matrix multiplication. Default to true.
*@li is_training:An bool identifying is training in the op. Default to true.

*@par Outputs:
*six outputs:
*@li y:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li output_h:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li update:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li reset:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li new:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li hidden_new:Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DynamicGRUV2Hidden)
.INPUT(x_weight_input, TensorType({DT_FLOAT32}))
.INPUT(weight_hidden, TensorType({DT_FLOAT16}))
.OPTIONAL_INPUT(bias_hidden, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(seq_length, TensorType({DT_INT32}))
.OPTIONAL_INPUT(init_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(output_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(update, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(reset, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(new, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(hidden_new, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(direction, String, "UNIDIRECTIONAL")
.ATTR(cell_depth, Int, 1)
.ATTR(keep_prob, Float, 1.0)
.ATTR(cell_clip, Float, -1.0)
.ATTR(num_proj, Int, 0)
.ATTR(time_major, Bool, true)
.ATTR(activation, String, "tanh")
.ATTR(gate_order, String, "zrh")
.ATTR(reset_after, Bool, true)
.ATTR(is_training, Bool, true)
.OP_END_FACTORY_REG(DynamicGRUV2Hidden)


/**
*@brief: DynamicGRUV2Grad calculation.
*@par Inputs:
@@ -618,7 +962,6 @@ REG_OP(DynamicGRUV2)
*@li cell_clip:An float identifying the cell clip in the op. Default to -1.
*@li num_proj:An integer identifying the num projection in the op. Default to 0.
*@li time_major:An bool identifying the time major in the op. Default to true.
*@li bias_type:An string identifying the type of bias_type function in the op. Default to "double_bias".
*@li gate_order:An string identifying the gate order in weight and bias. Default to "zrh". "rzh" is another option.
*@li reset_after:An bool identifying whether to apply reset gate after matrix multiplication. Default to true.

@@ -630,6 +973,9 @@ REG_OP(DynamicGRUV2)
*@li db_hidden:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li dx:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li dh_prev:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DynamicGRUV2Grad)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -658,7 +1004,6 @@ REG_OP(DynamicGRUV2Grad)
.ATTR(cell_clip, Float, -1.0)
.ATTR(num_proj, Int, 0)
.ATTR(time_major, Bool, true)
.ATTR(bias_type, String, "double_bias")
.ATTR(gate_order, String, "zrh")
.ATTR(reset_after, Bool, true)
.OP_END_FACTORY_REG(DynamicGRUV2Grad)
@@ -667,7 +1012,7 @@ REG_OP(DynamicGRUV2Grad)
*@brief: GRUV2HiddenGrad calculation.
*@par Inputs:
*nine inputs: \n
*@li weight_hidden:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li dh_pre_t:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li init_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li dy:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
@@ -678,6 +1023,7 @@ REG_OP(DynamicGRUV2Grad)
*@li hidden_new:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.

*@par Attributes:
*@li t_state:An int identifying the current t state, in the range [0, 4]. Defaults to 0.
*@li gate_order:An string identifying the gate order in weight and bias. Default to "zrh". "rzh" is another option.

*@par Outputs:
@@ -685,10 +1031,12 @@ REG_OP(DynamicGRUV2Grad)
*@li dh_prev:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li dgate_h:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li dnt_x:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(GRUV2HiddenGrad)
.INPUT(weight_hidden, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(init_h, TensorType({DT_FLOAT16, DT_FLOAT}))
REG_OP(GRUV2HiddenGradCell)
.INPUT(dh_pre_t, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(h, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(dy, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(dh, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -699,8 +1047,197 @@ REG_OP(GRUV2HiddenGrad)
.OUTPUT(dh_prev, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(dgate_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(dnt_x, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(t_state, Int, 0)
.ATTR(gate_order, String, "zrh")
.OP_END_FACTORY_REG(GRUV2HiddenGrad)
.OP_END_FACTORY_REG(GRUV2HiddenGradCell)

/**
* @brief Calculates the reversed outputs of the function "embedding". \n

* @par Inputs:
* Two inputs, including:
* @li grad: A mutable Tensor of word gradients. Must be one of the following types:
* float32.
* @li indices: A mutable word index Tensor of the int32 type.\n

* @par Attributes:
* @li num_weights: An int attr specifying the number of words in the dictionary. \n

* @li padding_idx: An int attr specifying the word index whose gradient row is filled with zeros. Defaults to "-1". \n

* @li scale_grad_by_freq: An optional bool. Defaults to "False".
* If "True", "grad_weight" will be scaled by the word frequency.
* If "False", "grad_weight" will not be scaled by the word frequency. \n

* @par Outputs:
* @li y: A mutable output Tensor of new word gradients ("grad_weight"). Has the same type as "grad". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator EmbeddingDenseGrad.
*/
REG_OP(EmbeddingDenseGrad)
.INPUT(grad, TensorType({ DT_FLOAT32 })) /* "First operand." */
.INPUT(indices, TensorType({ DT_INT32 })) /* "Second operand." */
.OUTPUT(y, TensorType({ DT_FLOAT32 })) /* "Result, has same element type as two inputs" */
.REQUIRED_ATTR(num_weights, Int)
.ATTR(padding_idx, Int, -1)
.ATTR(scale_grad_by_freq, Bool, false)
.OP_END_FACTORY_REG(EmbeddingDenseGrad)
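
The backward of an embedding lookup is a scatter-add of the output gradients into rows of the weight gradient, with the "padding_idx" row left at zero; a minimal host sketch (scale_grad_by_freq omitted):

#include <cstdio>
#include <vector>

int main() {
  const int num_weights = 5, dim = 2, padding_idx = 0;
  const std::vector<int> indices{1, 3, 1, 0};
  const std::vector<float> grad{1, 1, 2, 2, 3, 3, 4, 4};  // shape [4, dim]
  std::vector<float> grad_weight(num_weights * dim, 0.0f);
  for (size_t n = 0; n < indices.size(); ++n) {
    if (indices[n] == padding_idx) continue;  // padding row stays zero
    for (int d = 0; d < dim; ++d)
      grad_weight[indices[n] * dim + d] += grad[n * dim + d];
  }
  for (int w = 0; w < num_weights; ++w)  // row 1 collects (1,1) + (3,3)
    std::printf("row %d: %g %g\n", w, grad_weight[w * 2], grad_weight[w * 2 + 1]);
  return 0;
}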

/**
*@brief CommonLSTM calculation.
*@par Inputs:
*eight inputs: \n
*@li x:Each time step is a 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li w:Each direction is a 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li r:Each direction is a 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
*@li b:An optional input. Each direction is a 1D Tensor. Must be one of the following types: float16, float32. The format must be ND.
*@li sequence_lens:An optional input. A 1D Tensor. Must be one of the following types: int32. The format must be ND.
*@li initial_h:An optional input. Each direction is a 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li initial_c:An optional input. Each direction is a 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li p:An optional input. Each direction is a 1D Tensor. Must be one of the following types: float16, float32. The format must be ND.

*@par Attributes:
*@li activation_alpha:Optional scaling values used by some activation functions. Empty is currently supported.
*@li activation_beta:Optional scaling values used by some activation functions. Empty is currently supported.
*@li activations:The list of activation functions. Empty is currently supported.
*@li clip:A float identifying the cell clip in the op. Defaults to -1.
*@li direction:Specify if the RNN is forward, reverse, or bidirectional. Must be one of forward(default), reverse, or bidirectional.
*@li hidden_size:Number of neurons in the hidden layer. Reserved.
*@li input_forget:Couple the input and forget gates if 1. Reserved.

*@par Outputs:
*three outputs: \n
*@li y:First dimension is time step, second dimension is direction, others is a 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li y_h:Each direction is a 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*@li y_c:Each direction is a 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
*/

REG_OP(CommonLSTM)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(w, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(r, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(b, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(sequence_lens, TensorType({DT_INT32}))
.OPTIONAL_INPUT(initial_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(initial_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(p, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y_c, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(activation_alpha, ListFloat, {})
.ATTR(activation_beta, ListFloat, {})
.ATTR(activations, ListString, {})
.ATTR(clip, Float, -1.0)
.ATTR(direction, String, "forward")
.REQUIRED_ATTR(hidden_size, Int)
.ATTR(input_forget, Int, 0)
.OP_END_FACTORY_REG(CommonLSTM)

/**
* @brief Calculate the mask. According to hidden_size and num_step, convert seq_length to mask.
*
* @par Inputs:
* @li seq_length: A 1D Tensor of type int32. Records the current length of each batch. Shape: [batch_size].
* @li b: A 1D Tensor of type fp16/fp32. Records the hidden size. Shape: [4 * hidden_size].
* @li x: A 3D Tensor of type fp16/fp32. Records num_step/batch_size/input_size. Shape: [num_step, batch_size, input_size].
*
* @par Outputs:
* seq_mask: A 3D Tensor of type fp16/fp32 with the shape [num_step, batch_size, hidden_size]. Has the same type as "b". \n
*
* @par Restrictions:
* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(RnnGenMaskV2)
.INPUT(seq_length, TensorType({DT_INT32}))
.INPUT(b, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(seq_mask, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(RnnGenMaskV2)
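
The mask is simply 1.0 for steps before each batch lane's recorded length and 0.0 afterwards, broadcast across hidden_size; a host-side sketch:

#include <cstdio>
#include <vector>

int main() {
  const int num_step = 4, batch_size = 2, hidden_size = 3;
  const std::vector<int> seq_length{2, 4};  // current length of each lane
  std::vector<float> seq_mask(num_step * batch_size * hidden_size);
  for (int t = 0; t < num_step; ++t)
    for (int b = 0; b < batch_size; ++b)
      for (int h = 0; h < hidden_size; ++h)
        seq_mask[(t * batch_size + b) * hidden_size + h] =
            (t < seq_length[b]) ? 1.0f : 0.0f;
  std::printf("mask[t=2][b=0][0] = %g (past length 2)\n",
              seq_mask[(2 * batch_size + 0) * hidden_size]);
  return 0;
}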

/**
* @brief Common GRU calculation.

* @par Inputs:
* Six inputs, including:
* @li x: The input sequences, packed (and potentially padded) into one 3D Tensor (float16). The format must be FRACTAL_NZ
* @li w: The weight tensor for the gates, a 3D Tensor (float16). The format must be FRACTAL_Z
* @li r: The recurrence weight tensor, a 3D Tensor (float16). The format must be FRACTAL_Z
* @li b: The bias tensor for the gates. The format must be ND
* @li sequence_lens: Optional tensor specifying lengths of the sequences (int32). The format must be ND
* @li initial_h: Optional initial value of the hidden state (float16, float32). The format must be FRACTAL_NZ

* @par Attributes:
* @li activation_alpha: Optional scaling values used by some activation functions. \n
* @li activation_beta: Optional scaling values used by some activation functions. \n
* @li activations: A list of 2 (or 4 if bidirectional) activation functions for update, reset, and hidden gates. \n
* @li clip: Cell clip threshold. \n
* @li direction: Specify if the RNN is forward, reverse, or bidirectional. \n
* @li hidden_size: Number of neurons in the hidden layer. \n
* @li linear_before_reset: When computing the output of the hidden gate, apply the linear transformation before multiplying by the output of the reset gate. \n

* @par Outputs:
* @li y: A Tensor that concats all the intermediate output values of the hidden(float16,float32). The format must be FRACTAL_NZ
* @li y_h: The last output value of the hidden(float16,float32). The format must be FRACTAL_NZ
*/
REG_OP(CommonGRU)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(w, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(r, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(b, TensorType({DT_FLOAT16, DT_FLOAT}))
.OPTIONAL_INPUT(sequence_lens, TensorType({DT_INT32}))
.OPTIONAL_INPUT(initial_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OUTPUT(y_h, TensorType({DT_FLOAT16, DT_FLOAT}))
.ATTR(activation_alpha, ListFloat, {})
.ATTR(activation_beta, ListFloat, {})
.ATTR(activations, ListString, {})
.ATTR(clip, Float, -1.0)
.ATTR(direction, String, "forward")
.REQUIRED_ATTR(hidden_size, Int)
.ATTR(linear_before_reset, Int, 0)
.OP_END_FACTORY_REG(CommonGRU)
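
The "linear_before_reset" attribute switches between the two ONNX GRU candidate formulas; a scalar sketch of the difference (all names are illustrative):

#include <cmath>
#include <cstdio>

// ONNX GRU hidden-gate candidate, scalar form:
//   linear_before_reset = 0:  h~ = tanh(w_x + R * (r * h_prev) + b)
//   linear_before_reset = 1:  h~ = tanh(w_x + r * (R * h_prev + b))
int main() {
  const float w_x = 0.2f, R = 0.5f, h_prev = 0.8f, r = 0.6f, b = 0.1f;
  const float lbr0 = std::tanh(w_x + R * (r * h_prev) + b);
  const float lbr1 = std::tanh(w_x + r * (R * h_prev + b));
  std::printf("linear_before_reset=0: %g\nlinear_before_reset=1: %g\n",
              lbr0, lbr1);
  return 0;
}
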
/**
* @brief Computes sums, means or maxes of "bags" of embeddings. \n

* @par Inputs:
* Four inputs, including:
* @li weight: A mutable Tensor of the embedding table. Must be one of the following types:
* float32.
* @li indices: A mutable word index Tensor of the int32 type.\n
* @li offsets: A mutable word index Tensor of the int32 type.\n
* @li per_sample_weights: An optional Tensor of float32 weights; if not specified, all weights are taken to be 1.
* If specified, per_sample_weights must have exactly the same shape as input
* and is treated as having the same offsets, if those are not None.
* Only supported for mode='sum'.\n

* @par Attributes:
* @li mode: A string attr among "sum", "mean" and "max". Specifies the way to reduce the bag. \n

* @li scale_grad_by_freq: An optional bool. Defaults to "False".
* If "True", "grad_weight" will be scaled by the word frequency.
* If "False", "grad_weight" will not be scaled by the word frequency. \n
* @li sparse: If "True", the gradient w.r.t. the "weight" matrix will be a sparse tensor. \n
* @li include_last_offset: If "True", "offsets" has one additional element, where the last element
* is equivalent to the size of indices. This matches the CSR format. \n

* @par Outputs:
* @li y: A mutable output Tensor of the reduced embeddings. Has the same type as "weight". \n

* @par Third-party framework compatibility
* Compatible with the Pytorch operator EmbeddingBag.
*/
REG_OP(EmbeddingBag)
.INPUT(weight, TensorType({ DT_FLOAT32 }))
.INPUT(indices, TensorType({ DT_INT32 }))
.OPTIONAL_INPUT(offsets, TensorType({DT_INT32}))
.OPTIONAL_INPUT(per_sample_weights, TensorType({DT_FLOAT32}))
.OUTPUT(y, TensorType({ DT_FLOAT32 }))
.ATTR(mode, String, "mean")
.ATTR(scale_grad_by_freq, Bool, false)
.ATTR(sparse, Bool, false)
.ATTR(include_last_offset, Bool, false)
.OP_END_FACTORY_REG(EmbeddingBag)
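
A host sketch of the "sum" mode with "offsets", following the PyTorch semantics referenced above (per_sample_weights omitted, i.e. all ones):

#include <cstdio>
#include <vector>

int main() {
  const int dim = 2;
  const std::vector<float> weight{0, 0, 1, 1, 2, 2, 3, 3};  // 4 words, dim 2
  const std::vector<int> indices{1, 2, 3, 0, 2};
  const std::vector<int> offsets{0, 3};  // bag 0 = indices[0:3), bag 1 = rest
  std::vector<float> y(offsets.size() * dim, 0.0f);
  for (size_t bag = 0; bag < offsets.size(); ++bag) {
    const int begin = offsets[bag];
    const int end = (bag + 1 < offsets.size()) ? offsets[bag + 1]
                                               : static_cast<int>(indices.size());
    for (int k = begin; k < end; ++k)
      for (int d = 0; d < dim; ++d)
        y[bag * dim + d] += weight[indices[k] * dim + d];  // mode = "sum"
  }
  std::printf("bag0 = (%g, %g), bag1 = (%g, %g)\n", y[0], y[1], y[2], y[3]);
  return 0;
}
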
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_RNN_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/rpn_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/save_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/sdca_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 406
- 20
third_party/fwkacllib/inc/ops/selection_ops.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -239,6 +239,30 @@ REG_OP(GatherV2D)
.REQUIRED_ATTR(axis, Int)
.OP_END_FACTORY_REG(GatherV2D)

/**
*@brief Gathers values along an axis specified by "dim" . \n

*@par Inputs:
*@li x: A Tensor. Must be one of the following types: float16, float32, int32, int64.
*@li index: A Tensor. Must be one of the following types: int64 . \n

*@par Attributes:
* dim: the axis along which to index . \n

*@par Outputs:
* y: A Tensor. Has the same type as "x" . \n

*@par Third-party framework compatibility
*Compatible with the PyTorch operator Gather.
*/

REG_OP(GatherElements)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT64}))
.INPUT(index, TensorType({DT_INT64}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT64}))
.ATTR(dim, Int, 0)
.OP_END_FACTORY_REG(GatherElements)
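
For dim = 0, the op picks y[i][j] = x[index[i][j]][j] (and symmetrically along other axes); a small sketch:

#include <cstdint>
#include <cstdio>

int main() {
  const int rows = 2, cols = 3;
  const float x[rows][cols] = {{1, 2, 3}, {4, 5, 6}};
  const std::int64_t index[rows][cols] = {{1, 0, 1}, {0, 1, 0}};
  float y[rows][cols];
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      y[i][j] = x[index[i][j]][j];  // dim = 0: the row comes from "index"
  std::printf("y[0] = %g %g %g\n", y[0][0], y[0][1], y[0][2]);  // 4 2 6
  return 0;
}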

/**
*@brief Extracts a strided slice of a tensor. Roughly speaking, this op
extracts a slice of size (end-begin)/stride from the given input tensor.
@@ -275,8 +299,6 @@ REG_OP(GatherV2D)
*@par Outputs:
*y: A Tensor. Has the same type as "x" . \n

*@attention Constraints:

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator StridedSlice.
*/
@@ -327,8 +349,6 @@ REG_OP(StridedSlice)
*@par Outputs:
*y: A Tensor. Has the same type as "x" . \n

*@attention Constraints:

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator StridedSlice.

@@ -385,8 +405,6 @@ REG_OP(StridedSliceD)
*@par Outputs:
*output: A Tensor. Has the same type as "dy" . \n

*@attention Constraints:

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator StridedSliceGradD.

@@ -444,8 +462,6 @@ REG_OP(StridedSliceGradD)
*@par Outputs:
*output: A Tensor has the same type as "dy" . \n

*@attention Constraints:

*@par Third-party framework compatibility
* Compatible with the TensorFlow operator StridedSliceGrad.
*/
@@ -486,6 +502,38 @@ REG_OP(UnsortedSegmentSum)
.OUTPUT(y, TensorType::NumberType())
.OP_END_FACTORY_REG(UnsortedSegmentSum)

/**
*@brief Creates a one-dimensional tensor of size "steps" whose values are evenly spaced from "start" to
* "end", inclusive, on a logarithmic scale with base "base". \n

*@par Inputs:
*One input, including:
* @li assist: A tensor. Must be one of the following types:
* float16, float32. \n

* @par Attributes:
* @li start: A required float. Used to select the start. \n
* @li end: A required float. Used to select the end. \n
* @li steps: An optional int. Defaults to 100. \n
* @li base: An optional float. Defaults to 10.0. \n
* @li dtype: An optional int. Defaults to 1. \n

*@par Outputs:
*y: A Tensor with the same type and shape as "assist". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator logspace. \n
*/
REG_OP(LogSpaceD)
.INPUT(assist, TensorType({DT_FLOAT, DT_FLOAT16}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
.REQUIRED_ATTR(start, Float)
.REQUIRED_ATTR(end, Float)
.ATTR(steps, Int, 100)
.ATTR(base, Float, 10.0)
.ATTR(dtype, Int, 1)
.OP_END_FACTORY_REG(LogSpaceD)
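
The k-th output value is base^(start + k * (end - start) / (steps - 1)); a host sketch of the schedule:

#include <cmath>
#include <cstdio>

int main() {
  const float start = 0.0f, end = 2.0f, base = 10.0f;
  const int steps = 5;  // steps > 1 assumed in this sketch
  for (int k = 0; k < steps; ++k) {
    const float e = start + k * (end - start) / (steps - 1);
    std::printf("%g ", std::pow(base, e));  // 1 3.16228 10 31.6228 100
  }
  std::printf("\n");
  return 0;
}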

/**
*@brief Computes the sum along segments of a tensor . \n

@@ -796,6 +844,34 @@ REG_OP(SliceD)
.REQUIRED_ATTR(size, ListInt)
.OP_END_FACTORY_REG(SliceD)

/**
*@brief Extracts a slice from a tensor.
* This operation extracts a slice of size "size" from a tensor "x"
* starting at the location specified by "begin" . \n

*@par Inputs:
*@li x: A Tensor. Must be one of the following types:
* float16, float32, double, int64, int32, uint8, uint16, uint32, uint64, int8,
* int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32 . \n

*@li offsets: The starting location for the slice.

*@par Attributes:
*@li size: The tensor shape . \n

*@par Outputs:
*y: A Tensor. Has the same type as "x". The slice extracted from the tensor.
*@par Restrictions:
*Warning: THIS FUNCTION IS DEPRECATED. Please use Slice instead.
*/
REG_OP(SliceDV2)
.INPUT(x, TensorType::BasicType())
.INPUT(offsets, TensorType::IndexNumberType())
.OUTPUT(y, TensorType::BasicType())
.REQUIRED_ATTR(size, ListInt)
.OP_END_FACTORY_REG(SliceDV2)
/**
* @brief Finds values and indices of the "k" largest elements for the last
* dimension . \n
@@ -829,8 +905,8 @@ REG_OP(SliceD)
* @li sorted = true
* @li It's unstable sorted indices on the platform of Ascend310

* @par Third-party framework compatibility
* @li Compatible with the TensorFlow operator TopK.
* @par Restrictions:
* Warning: THIS FUNCTION IS DEPRECATED. Please use TopKV2 instead.
*/
REG_OP(TopKD)
.INPUT(x, TensorType::RealNumberType())
@@ -855,6 +931,44 @@ REG_OP(TopKD)
* Number of top elements to look for along the last dimension (along each row
* for matrices) . \n

* @par Attributes:
* @li sorted: An optional bool. Defaults to true.
* If true, the resulting "k" elements will be sorted by the values in descending
* order.
* @li dim: An optional int. Defaults to -1. For reserved use.
* @li largest: An optional bool. Defaults to true. For reserved use. \n

* @par Outputs:
* @li values: A Tensor, specifying the sorted data. Has the same type as
* "input".
* @li indices: A Tensor of type int32, specifying the indices of sorted data . \n

* @see TopK()
* @par Third-party framework compatibility
* @li Compatible with the TensorFlow operator TopKV2.
*/
REG_OP(TopKV2)
.INPUT(x, TensorType::RealNumberType())
.INPUT(k, TensorType({DT_INT32}))
.OUTPUT(values, TensorType::RealNumberType())
.OUTPUT(indices, TensorType({DT_INT32}))
.ATTR(sorted, Bool, true)
.ATTR(dim, Int, -1)
.ATTR(largest, Bool, true)
.OP_END_FACTORY_REG(TopKV2)

/**
* @brief Finds values and indices of the "k" largest elements for the last
* dimension . \n

* @par Inputs:
* Two inputs, including:
* @li x: A 1D or higher tensor of type BasicType, with the last dimension
* at least "k".
* @li k: A 0D Tensor of type int32.
* Number of top elements to look for along the last dimension (along each row
* for matrices) . \n

* @par Attributes:
* @li sorted: An optional bool. Defaults to true.
* If true, the resulting "k" elements will be sorted by the values in descending
@@ -876,15 +990,17 @@ REG_OP(TopK)
.OUTPUT(values, TensorType::RealNumberType())
.OUTPUT(indices, TensorType({DT_INT32}))
.ATTR(sorted, Bool, true)
.ATTR(largest, Bool, true)
.ATTR(dim, Int, -1)
.OP_END_FACTORY_REG(TopK)
/**
*@brief Creates a new tensor by applying sparse "updates" to individual values or slices within a tensor (initially zero for numeric, empty for string) of the given "shape" according to "indices" . \n

*@par Inputs:
*Inputs including:
* @li indices: A required index tensor. Must be one of the following types: float32, float16, int32, int8, uint8.
* @li x: A required slice tensor. Must be one of the following types: float32, float16, int32, int8, uint8.
* @li shape: A required list of int32, specifying the output shape.
* @li indices: A required index tensor. Must be one of the following types: int32 or int64.
* @li x: A required slice tensor. Must be one of the following types: float32, float16, int32, int8, uint8...
* @li shape: A required list of int32 or int64, specifying the output shape.
*@par Outputs:
*y:A output Tensor with same datatype as "updates" . \n

@@ -895,7 +1011,7 @@ REG_OP(TopK)
* Compatible with the TensorFlow operator ScatterNd.
*/
REG_OP(ScatterNd)
.INPUT(indices, TensorType::BasicType())
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(x, TensorType::BasicType())
.INPUT(shape, TensorType::IndexNumberType())
.OUTPUT(y, TensorType::BasicType())
@@ -908,11 +1024,11 @@ REG_OP(ScatterNd)
*@par Inputs:
*Inputs including:
* @li indices: A required index tensor. Must be one of the following types:
* float, float16, int32, int16. format:ND.
* int32 or int64. format:ND.
* @li x: A required slice tensor. Must be one of the following types:
* float, float16, int32, int16. format:ND.
* float16, float, int32, int8, uint8. format:ND.
*@par Attributes:
* @li shape: A required list of int32, specifying the output shape.
* @li shape: A required list of int32 or int64, specifying the output shape.
*@par Outputs:
*y: A Tensor. Has the same type as "x". format:ND . \n

@@ -927,8 +1043,8 @@ REG_OP(ScatterNd)
*/
REG_OP(ScatterNdD)
.INPUT(indices, TensorType::IndexNumberType())
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT16}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT16}))
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32, DT_INT8, DT_UINT8}))
.REQUIRED_ATTR(shape, ListInt)
.OP_END_FACTORY_REG(ScatterNdD)

@@ -1752,6 +1868,33 @@ REG_OP(Crop)
.REQUIRED_ATTR(offsets, ListInt)
.OP_END_FACTORY_REG(Crop)

/**
*@brief Returns a namedtuple (values, indices) where "values" is the cumulative
* minimum of the elements of the input in the dimension "dim",
* and "indices" is the index location of each minimum value found in the dimension "dim". \n

*@par Inputs:
*One input, including:
* @li x: A tensor. Must be one of the following types:
* float16, float32, int32, uint32, int8, uint8. \n

*@par Attributes:
* @li axis: Axis along which to compute the cumulative minimum. \n

*@par Outputs:
* y: A Tensor with the same type and shape as "x". \n
* indices: A Tensor of type int32 with the same shape as "x". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator Cummin. \n
*/
REG_OP(Cummin)
.INPUT(x, TensorType::BasicType())
.OUTPUT(y, TensorType::BasicType())
.OUTPUT(indices, TensorType::BasicType())
.REQUIRED_ATTR(axis, Int)
.OP_END_FACTORY_REG(Cummin)
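
A sketch of the running minimum together with the index where each minimum was found, along a 1-D axis (tie handling here keeps the earliest index; the kernel's tie behavior may differ):

#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> x{3, 1, 4, 1, 0.5f};
  std::vector<float> y(x.size());
  std::vector<int> indices(x.size());
  for (size_t k = 0; k < x.size(); ++k) {
    if (k == 0 || x[k] < y[k - 1]) {  // strict '<' keeps the earliest index
      y[k] = x[k];
      indices[k] = static_cast<int>(k);
    } else {
      y[k] = y[k - 1];
      indices[k] = indices[k - 1];
    }
  }
  for (size_t k = 0; k < x.size(); ++k)
    std::printf("y=%g idx=%d\n", y[k], indices[k]);
  return 0;
}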

/**
*@brief Extends the input with copies of data along a specified dimension. For example:
*(1) If x = [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]], with shape (2, 3, 2);
@@ -1921,6 +2064,249 @@ REG_OP(CumulativeLogsumexpD)
.ATTR(exclusive, Bool, false)
.ATTR(reverse, Bool, false)
.OP_END_FACTORY_REG(CumulativeLogsumexpD)

/**
* @brief Add updates to var according to axis and indices.

* @par Inputs:
* Three inputs, including:
* @li var: A Tensor. Must be one of the following types:
* float16, float32, int16, int32, int8, uint8.
* @li indices: A Tensor of the indices, type should be int32.
* @li updates: A Tensor of the same type as "var". \n

* @par Attributes:
* @li axis: A required int specifying the axis along which to perform the indices add. \n

* @par Outputs:
* @li var: A Tensor. Same as input "var".

* @par Third-party framework compatibility
* Compatible with the Pytorch operator index_add_.
*/
REG_OP(InplaceIndexAdd)
.INPUT(var, TensorType({DT_INT16, DT_INT32, DT_INT8,
DT_UINT8, DT_FLOAT32, DT_FLOAT16}))
.INPUT(indices, TensorType({DT_INT32}))
.INPUT(updates, TensorType({DT_INT16, DT_INT32, DT_INT8,
DT_UINT8, DT_FLOAT32, DT_FLOAT16}))
.OUTPUT(var, TensorType({DT_INT16, DT_INT32, DT_INT8,
DT_UINT8, DT_FLOAT32, DT_FLOAT16}))
.REQUIRED_ATTR(axis, Int)
.OP_END_FACTORY_REG(InplaceIndexAdd)
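
A sketch of the index_add_ semantics for axis 0: rows of "updates" are accumulated in place into the rows of "var" selected by "indices":

#include <cstdio>
#include <vector>

int main() {
  const int dim = 2;
  std::vector<float> var{0, 0, 0, 0, 0, 0};   // shape [3, 2]
  const std::vector<int> indices{2, 0};
  const std::vector<float> updates{1, 1, 5, 5};  // shape [2, 2]
  for (size_t n = 0; n < indices.size(); ++n)
    for (int d = 0; d < dim; ++d)
      var[indices[n] * dim + d] += updates[n * dim + d];
  std::printf("var = %g %g | %g %g | %g %g\n",  // 5 5 | 0 0 | 1 1
              var[0], var[1], var[2], var[3], var[4], var[5]);
  return 0;
}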

/**
* @brief Replace the value of X with value according to mask.
* @par Inputs:
* three inputs, including:
* @li x: A Tensor of type float16, float32, int64, int32 or int8.
* @li mask: A Tensor of type bool.
* @li value: A Tensor of type float16, float32, int64, int32 or int8.

* @par Outputs:
* @li y: A tensor. Must be one of the following dtypes:
* float16, float32, int64, int32, int8.
*/
REG_OP(MaskedFill)
.INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT32, DT_INT64}))
.INPUT(mask, TensorType({DT_BOOL}))
.INPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT32, DT_INT64}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT32, DT_INT64}))
.OP_END_FACTORY_REG(MaskedFill)

/**
* @brief Choose the value of X with value according to mask.

* @par Inputs:
* two inputs, including:
* @li x: A Tensor of type float16 or float32.
* @li mask: A Tensor of type bool. \n

* @par Outputs:
* @li y: A tensor with the same type as x. \n

* @par Third-party framework compatibility
* Compatible with the Numpy operator select.
* Replaces the pytorch operator masked_select in some scenarios.\n
*/
REG_OP(MaskedSelectV2)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
.INPUT(mask, TensorType({DT_BOOL}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
.OP_END_FACTORY_REG(MaskedSelectV2)

/**
* @brief Slice a tensor at its last dim, e.g. a[..., begin:end:stride]. \n

* @par Inputs:
* One input, including:
* @li x: A Tensor. Must be one of the following types: float16, float32, double, int8, int16, int32, int64.

* @par Attributes:
* @li start: An attribute of type Int, start index of last dim. \n
* @li end: An attribute of type Int, end index of last dim. \n
* @li stride: An attribute of type Int, stride of slice. \n

* @par Outputs:
* @li y: A Tensor. Has the same type as "x". \n

* @par Third-party framework compatibility
* No compatibility
*/
REG_OP(SliceLastDim)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8, DT_INT16, DT_INT32, DT_INT64}))
.REQUIRED_ATTR(start, Int)
.REQUIRED_ATTR(end, Int)
.ATTR(stride, Int, 1)
.OP_END_FACTORY_REG(SliceLastDim)

/**
* @brief Extracts a strided slice of a tensor. Roughly speaking, this op \n
* extracts a slice of size (end-begin)/stride from the given input tensor. \n
* Starting at the location specified by begin the slice continues by \n
* adding stride to the index until all dimensions are not less than end. \n
*
* @par Inputs:
* Four inputs, including:
* @li x: A Tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, \n
* complex64, int64, qint8, quint8, qint32, qint16, quint16, uint16, \n
* complex128, float16, uint32, uint64. \n
* @li begin: A Tensor of type int32 or int64, for the index of the first value to select.
*
* @li end: A Tensor of type int32 or int64, for the index of the last value to select.
*
* @li axes: A Tensor of type int32 or int64, indicating the axes to select.
*
* @li strides: A Tensor of type int32 or int64, for the increment.
*
* @par Attributes:
* @li begin_mask: An int32 attribute. \n
* A bitmask where a bit "i" being "1" means to ignore the begin \n
* value and instead use the largest interval possible.
* @li end_mask: An int32 attribute. \n
* Analogous to "begin_mask".
* @li ellipsis_mask: An int32 attribute. \n
* A bitmask where bit "i" being "1" means the "i"th position \n
* is actually an ellipsis.
* @li new_axis_mask: An int32 attribute. \n
* A bitmask where bit "i" being "1" means the "i"th \n
* specification creates a new shape 1 dimension.
* @li shrink_axis_mask: An int32 attribute. \n
* A bitmask where bit "i" implies that the "i"th \n
* specification should shrink the dimensionality.
*
* @par Outputs:
* y: A Tensor. Has the same type as "x".
*
* @par Third-party framework compatibility
* Compatible with the TensorFlow operator StridedSliceV2.
*/
REG_OP(StridedSliceV2)
.INPUT(x, TensorType::BasicType())
.INPUT(begin, TensorType::IndexNumberType())
.INPUT(end, TensorType::IndexNumberType())
.OPTIONAL_INPUT(axes, TensorType::IndexNumberType())
.OPTIONAL_INPUT(strides, TensorType::IndexNumberType())
.ATTR(begin_mask, Int, 0)
.ATTR(end_mask, Int, 0)
.ATTR(ellipsis_mask, Int, 0)
.ATTR(new_axis_mask, Int, 0)
.ATTR(shrink_axis_mask, Int, 0)
.OUTPUT(y, TensorType::BasicType())
.OP_END_FACTORY_REG(StridedSliceV2)

/**
*@brief Fills the elements of the input tensor with value val by selecting the indices in the order given in index. \n

*@par Inputs:
*Three inputs, including:
* @li x: A tensor. Must be one of the following types:
* float16, float32, int32. \n
*@li assist1: A tensor. Must be one of the following types:
* float16, float32, int32. \n
*@li assist2: A tensor. Must be one of the following types:
* float16, float32, int32. \n

* @par Attributes:
* @li dim: A required int. Used to select the dimension of this tensor. \n

*@par Outputs:
*y: A Tensor with the same type and shape as "x". \n

*@par Third-party framework compatibility
*Compatible with the Pytorch operator IndexFill. \n
*/
REG_OP(IndexFillD)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(assist1, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.INPUT(assist2, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
.REQUIRED_ATTR(dim, Int)
.OP_END_FACTORY_REG(IndexFillD)
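
The fill rule itself is simple; a host-side C++ sketch for a 2-D tensor follows. The assist1/assist2 inputs of IndexFillD are device-side helper tensors, so this sketch works from a raw index list instead and is only meant to illustrate the semantics.

#include <vector>

// Overwrite every row (dim == 0) or column (dim == 1) listed in `index`
// of a row-major rows x cols buffer with `value`.
void IndexFill2D(std::vector<float> &x, int rows, int cols, int dim,
                 const std::vector<int> &index, float value) {
  for (int i : index) {
    if (dim == 0) {
      for (int c = 0; c < cols; ++c) x[i * cols + c] = value;
    } else {
      for (int r = 0; r < rows; ++r) x[r * cols + i] = value;
    }
  }
}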

/**
* @brief For each row r of "x" and for each column c, do x(r, c) += src(j, c), \n
* where j ranges from indices[r].first through indices[r].second - 1. \n
* In general, indices must be >= 0 and < src.NumRows(); \n
* but to represent an empty range you may use the pair (-1, -1) or any pair of numbers (i, j) such that i >= j. \n

* @par Inputs:
* Three inputs, including:
* @li x: A Tensor. Must be one of the following types:
* float16, float32.
* @li indices: A Tensor of the indices, type should be int32.
* @li src: A Tensor of the same type as "x". \n

* @par Outputs:
* @li x: A Tensor. Same as input "x".

* @par Third-party framework compatibility
* Compatible with the Kaldi operator AddRowRanges.
*/
REG_OP(AddRowRanges)
.INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(src, TensorType({DT_FLOAT16,DT_FLOAT}))
.INPUT(indices, TensorType({DT_INT32}))
.OUTPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
.OP_END_FACTORY_REG(AddRowRanges)
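
A direct C++ transcription of the accumulation rule above (the pair-of-ints range representation and container types are illustrative):

#include <utility>
#include <vector>

// For each output row r, accumulate rows indices[r].first ..
// indices[r].second - 1 of src. A pair with first >= second, such as
// (-1, -1), denotes an empty range and contributes nothing.
void AddRowRangesRef(std::vector<std::vector<float>> &x,
                     const std::vector<std::vector<float>> &src,
                     const std::vector<std::pair<int, int>> &indices) {
  for (size_t r = 0; r < x.size(); ++r) {
    for (int j = indices[r].first; j < indices[r].second; ++j) {
      for (size_t c = 0; c < x[r].size(); ++c) {
        x[r][c] += src[j][c];
      }
    }
  }
}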

/**
*@brief Fills a tensor along one axis over the given ranges.
* It is a customized masked fill range operator . \n

*@par Inputs:
* Four inputs, including:
*@li x: input tensor. A ND Tensor of float32/float16/int32/int8 with shapes
* 1-D (D,), 2-D(N, D), 3-D(N, C, D)
*@li start: masked fill start pos. A 3D Tensor of int32 with
* shape (num, N). "num" indicates the number of loop masked fill, and the value N
* indicates the batch of ND Tensor, if input x shape is 1-D, N = 1. \n
*@li end: masked fill end pos. A 3D Tensor of int32 with
* shape (num, N). "num" indicates the number of loop masked fill, and the value N
* indicates the batch of ND Tensor. \n
*@li value: masked fill value. A 2D Tensor of float32/float16/int32/int8 with
* shape (num,). "num" indicates the number of loop masked fill

*@par Attributes:
*@li axis: A required attribute of type int32, the axis to fill along.

*@par Outputs:
*y: A ND Tensor of float32/float16/int32/int8 with shapes 1-D (D,), 2-D(N, D), 3-D(N, C, D)

* @par Restrictions:
* Warning: the input shape's total element count must not exceed 1024 * 1024 * 1024.
*/
REG_OP(MaskedFillRange)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32}))
.INPUT(start, TensorType({DT_INT32}))
.INPUT(end, TensorType({DT_INT32}))
.INPUT(value, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32}))
.REQUIRED_ATTR(axis, Int)
.OP_END_FACTORY_REG(MaskedFillRange)
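
A 1-D C++ sketch of the loop-of-ranges fill described above; higher-rank inputs apply the same rule along `axis` for each batch (function name and types are illustrative):

#include <vector>

// For each of the `num` loops k, overwrite positions [start[k], end[k])
// of x with value[k]. Later loops win where ranges overlap.
std::vector<float> MaskedFillRange1D(std::vector<float> x,
                                     const std::vector<int> &start,
                                     const std::vector<int> &end,
                                     const std::vector<float> &value) {
  for (size_t k = 0; k < start.size(); ++k) {
    for (int i = start[k]; i < end[k]; ++i) {
      x[i] = value[k];
    }
  }
  return x;
}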
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_SELECTION_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/set_ops.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 4
- 4
third_party/fwkacllib/inc/ops/sparse_ops.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -383,11 +383,11 @@ REG_OP(SparseFillEmptyRowsGrad)
  REG_OP(SparseTensorDenseMatMul)
      .INPUT(x1_indices, TensorType({DT_INT32, DT_INT64}))
      .INPUT(x1_values, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, \
-                                   DT_COMPLEXT64, DT_COMPLEX128, DT_FLOAT16}))
+                                   DT_COMPLEXT64, DT_COMPLEX128, DT_FLOAT16, DT_INT64}))
      .INPUT(x1_shape, TensorType({DT_INT64}))
-     .INPUT(x2, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_COMPLEXT64, \
+     .INPUT(x2, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_COMPLEXT64, \
                             DT_COMPLEX128, DT_FLOAT16}))
-     .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT32, DT_COMPLEXT64, \
+     .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_COMPLEXT64, \
                             DT_COMPLEX128, DT_FLOAT16}))
      .ATTR(adjoint_a, Bool, false)
      .ATTR(adjoint_b, Bool, false)
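
For context on the hunk above: the op multiplies a COO sparse matrix (x1_indices/x1_values/x1_shape) by a dense matrix x2, and the change widens the accepted value types to include int64. A minimal C++ sketch of the adjoint-free case, with illustrative container types:

#include <array>
#include <vector>

// y[i][c] += v * x2[j][c] for every COO entry ((i, j), v) of x1.
// Covers only adjoint_a == false and adjoint_b == false.
std::vector<std::vector<float>> SparseDenseMatMulRef(
    const std::vector<std::array<int, 2>> &x1_indices,
    const std::vector<float> &x1_values, int rows,
    const std::vector<std::vector<float>> &x2) {
  std::vector<std::vector<float>> y(
      rows, std::vector<float>(x2[0].size(), 0.0f));
  for (size_t n = 0; n < x1_values.size(); ++n) {
    const int i = x1_indices[n][0];
    const int j = x1_indices[n][1];
    for (size_t c = 0; c < x2[0].size(); ++c) {
      y[i][c] += x1_values[n] * x2[j][c];
    }
  }
  return y;
}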


+ 97
- 1
third_party/fwkacllib/inc/ops/spectral_ops.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -26,6 +26,24 @@

namespace ge {

/**
*@brief Computes the inverse 1-dimensional discrete Fourier transform over the
inner-most dimension of `x`. \n

*@par Inputs:
*@li x: A Tensor. Must be one of the following types: complex64, complex128. \n

*@par Outputs:
*@li y: A complex tensor of the same rank as `x`. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow IFFT operator.
*/
REG_OP(IFFT)
.INPUT(x, TensorType({DT_COMPLEX64,DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_COMPLEX64,DT_COMPLEX128}))
.OP_END_FACTORY_REG(IFFT)

/**
*@brief Real-valued fast Fourier transform . \n

@@ -47,6 +65,84 @@ REG_OP(RFFT)
.OUTPUT(y, TensorType({DT_COMPLEX64}))
.OP_END_FACTORY_REG(RFFT)

/**
*@brief Inverse real-valued fast Fourier transform. \n

*@par Inputs:
*@li x: A complex64 tensor.
*@li fft_length: An int32 tensor of shape [1]. The FFT length. \n

*@par Outputs:
*@li y: A float32 tensor of the same rank as `input`. The inner-most
dimension of `input` is replaced with the `fft_length` samples of its inverse
1D Fourier transform. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow IRFFT operator.
*/
REG_OP(IRFFT)
.INPUT(x, TensorType({DT_COMPLEX64}))
.INPUT(fft_length, TensorType({DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT}))
.OP_END_FACTORY_REG(IRFFT)


/**
*@brief 2D fast Fourier transform. \n

*@par Inputs:
*@li x: A complex64 tensor.

*@par Outputs:
*@li y: A complex64 tensor of the same shape as `input`. The inner-most 2
dimensions of `input` are replaced with their 2D Fourier transform. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow FFT2D operator.
*/
REG_OP(FFT2D)
.INPUT(x, TensorType({DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_COMPLEX64, DT_COMPLEX128}))
.OP_END_FACTORY_REG(FFT2D)

/**
*@brief Calculates the one-dimensional discrete Fourier transform over the
innermost dimension of the input. \n

*@par Inputs:
*@li x: A Tensor. Must be one of the following types: complex64, complex128. \n

*@par Outputs:
*@li y: A complex tensor with the same shape as input. The innermost dimension
of the input is replaced by its 1-dimensional Fourier transform. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow FFT operator.
*/
REG_OP(FFT)
.INPUT(x, TensorType({DT_COMPLEX64,DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_COMPLEX64,DT_COMPLEX128}))
.OP_END_FACTORY_REG(FFT)

/**
*@brief Calculates the inverse 2-dimensional discrete Fourier transform over the
innermost 2 dimensions of the input. \n

*@par Inputs:
*@li x: A Tensor. Must be one of the following types: complex64, complex128. \n

*@par Outputs:
*@li y: A complex tensor with the same shape as the input. The innermost 2
dimensions of the input are replaced by their inverse 2-dimensional Fourier transform. \n

*@par Third-party framework compatibility
* Compatible with TensorFlow IFFT2D operator.
*/
REG_OP(IFFT2D)
.INPUT(x, TensorType({DT_COMPLEX64,DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_COMPLEX64,DT_COMPLEX128}))
.OP_END_FACTORY_REG(IFFT2D)
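
The transforms added in this hunk all reduce to the discrete Fourier transform along the trailing dimension(s). As a semantic reference only, here is a naive O(n^2) DFT in C++; the sign convention (forward uses e^{-2*pi*i*k*t/n}, inverse adds a 1/n factor) is assumed to match the TensorFlow ops these are compatible with, and production kernels would use an FFT instead.

#include <complex>
#include <vector>

// sign = -1 computes the forward DFT; sign = +1 the inverse (with 1/n).
std::vector<std::complex<double>> DftRef(
    const std::vector<std::complex<double>> &x, int sign) {
  const size_t n = x.size();
  const double pi = 3.14159265358979323846;
  std::vector<std::complex<double>> y(n);
  for (size_t k = 0; k < n; ++k) {
    for (size_t t = 0; t < n; ++t) {
      y[k] += x[t] * std::polar(1.0, sign * 2.0 * pi *
                                         static_cast<double>(k * t) / n);
    }
    if (sign > 0) {
      y[k] /= static_cast<double>(n);
    }
  }
  return y;
}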

} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_SPECTRAL_OPS_H_

+ 13
- 13
third_party/fwkacllib/inc/ops/split_combination_ops.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -62,8 +62,8 @@ REG_OP(Split)
*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64

*@par Attributes:
-*@li split_dim: A required int8, int16, int32, or int64. Specifies the dimension along which to split. No default value.
-*@li num_split: A required int8, int16, int32, or int64. Specifies the number of output tensors. No default value . \n
+*@li split_dim: A required int32. Specifies the dimension along which to split. No default value.
+*@li num_split: A required int32. Specifies the number of output tensors. No default value . \n

*@par Outputs:
*y:Dynamic output. A list of output tensors. Has the same type and format as "x" . \n
@@ -94,12 +94,12 @@ REG_OP(SplitD)
*@par Inputs:
* Three inputs, including:
*@li x: An ND Tensor.
-*Must be one of the following types:
-*@li size_splits: A list of int8, int16, int32, or int64. Specifies a list containing the sizes of each output tensor along the split dimension.
-*@li split_dim: An int8, int16, int32, or int64. Specifies the dimension along which to split . \n
+*Must be one of the types:float16, float32, double, int64, int32, uint8, uint16, uint32, uint64, int8, int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32.
+*@li size_splits: Must be one of the types:int32, int64. Specifies a list containing the sizes of each output tensor along the split dimension.
+*@li split_dim: Must be the following type:int32. Specifies the dimension along which to split . \n

*@par Attributes:
-*num_split: A required int8, int16, int32, or int64. Specifies the number of output tensors. No default value . \n
+*num_split: A required int32. Specifies the number of output tensors. No default value . \n

*@par Outputs:
*y: Dynamic output.A list of output tensors. Has the same type and format as "x" . \n
@@ -129,9 +129,9 @@ REG_OP(SplitV)
*Must be one of the following types: float16, float32, int32, int8, int16, int64, uint8, uint16, uint32, uint64

*@par Attributes:
-*@li size_splits: A required list of int8, int16, int32, or int64. Specifies a list containing the sizes of each output tensor along the split dimension.
-*@li split_dim: A required int8, int16, int32, or int64. Specifies the dimension along which to split. No default value.
-*@li num_split: A required int8, int16, int32, or int64. Specifies the number of output tensors. No default value . \n
+*@li size_splits: A required list of int32. Specifies a list containing the sizes of each output tensor along the split dimension.
+*@li split_dim: A required int32. Specifies the dimension along which to split. No default value.
+*@li num_split: A required int32. Specifies the number of output tensors. No default value . \n

*@par Outputs:
*y: Dynamic output.A list of output tensors. Has the same type and format as "x" . \n
@@ -317,15 +317,15 @@ REG_OP(Concat)
* int64, uint8, uint16, uint32, uint64, float16, float32, bool . It's a dynamic input. \n

*@par Attributes:
-*@li axis: A optional int, defaultvalue is 0.
+*@li axis: A optional int, default value is 0.
* Dimension along which to pack. The range is [-(R+1), R+1).
*@li N: A required int. Number of tensors . \n

*@par Outputs:
*y: A Tensor. Has the same type as "x".

*@par Third-party framework compatibility
-*Compatible with the TensorFlow operator Pack.
-It's a dynamic output.
+* Compatible with the TensorFlow operator Pack.
*/
REG_OP(Pack)
.DYNAMIC_INPUT(x, TensorType::BasicType())


+ 1
- 1
third_party/fwkacllib/inc/ops/state_ops.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/stateful_random_ops.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/stateless_random_ops.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 380
- 2
third_party/fwkacllib/inc/ops/string_ops.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,6 +25,235 @@
#include "graph/operator_reg.h"

namespace ge {
/**
*@brief Creates ngrams from ragged string data . \n

*@par Inputs:
include:
*@li data: 1-D. The values tensor of the ragged string tensor to make ngrams out of.
*@li data_splits: The splits tensor of the ragged string tensor to make ngrams out of . \n

*@par Attributes:
* separator:The string to append between elements of the token. Use "" for no separator.
* ngram_widths:The sizes of the ngrams to create.
* left_pad:The string to use to pad the left side of the ngram sequence. Only used if pad_width != 0.
* right_pad:The string to use to pad the right side of the ngram sequence. Only used if pad_width != 0.
* pad_width:The number of padding elements to add to each side of each sequence.
* preserve_short_sequences: Preserve short sequences. \n

*@par Outputs:
*@li ngrams:The values tensor of the output ngrams ragged tensor.
*@li ngrams_splits:The splits tensor of the output ngrams ragged tensor. \n

*@see StringNGrams()

*@par Third-party framework compatibility
*compatible with StringNGrams op of tensorflow

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringNGrams)
.INPUT(data, TensorType({DT_STRING}))
.INPUT(data_splits, TensorType({DT_INT32, DT_INT64}))
.OUTPUT(ngrams, TensorType({DT_STRING}))
.OUTPUT(ngrams_splits, TensorType({DT_INT32, DT_INT64}))
.REQUIRED_ATTR(separator, String)
.ATTR(ngram_widths, ListInt, {})
.REQUIRED_ATTR(left_pad, String)
.REQUIRED_ATTR(right_pad, String)
.REQUIRED_ATTR(pad_width, Int)
.REQUIRED_ATTR(preserve_short_sequences, Bool)
.OP_END_FACTORY_REG(StringNGrams)

/**
*@brief Decodes each string in `input` into a sequence of Unicode code points . \n

*@par Inputs:
include:
*@li input:The text to be decoded. Can have any shape. Note that the output is flattened
to a vector of char values. \n

*@par Attributes:
* input_encoding:Text encoding of the input strings. This is any of the encodings supported
by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
* errors:Error handling policy when there is invalid formatting found in the input.
The value of 'strict' will cause the operation to produce an InvalidArgument
error on any invalid input formatting. A value of 'replace' (the default) will
cause the operation to replace any invalid formatting in the input with the
`replacement_char` codepoint. A value of 'ignore' will cause the operation to
skip any invalid formatting in the input and produce no corresponding output
character.
* replacement_char:The replacement character codepoint to be used in place of any invalid
formatting in the input when `errors='replace'`. Any valid unicode codepoint may
be used. The default value is the Unicode replacement character,
0xFFFD (decimal 65533).
* replace_control_characters:Whether to replace the C0 control characters (00-1F) with the
`replacement_char`. Default is false. \n

*@par Outputs:
*@li row_splits:A 1D tensor containing the row splits.
*@li char_values:A 1D tensor containing the decoded codepoints.
*@li char_to_byte_starts:A 1D int64 Tensor containing the byte index in the input string where each
character in `char_values` starts. \n

*@see UnicodeDecodeWithOffsets()

*@par Third-party framework compatibility
*compatible with UnicodeDecodeWithOffsets op of tensorflow

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(UnicodeDecodeWithOffsets)
.INPUT(input, TensorType({DT_STRING}))
.OUTPUT(row_splits, TensorType({DT_INT64}))
.OUTPUT(char_values, TensorType({DT_INT32}))
.OUTPUT(char_to_byte_starts, TensorType({DT_INT64}))
.REQUIRED_ATTR(input_encoding, String)
.ATTR(errors, String, "replace")
.ATTR(replacement_char, Int, 65533)
.ATTR(replace_control_characters, Bool, false)
.ATTR(Tsplits, Type, DT_INT64)
.OP_END_FACTORY_REG(UnicodeDecodeWithOffsets)

/**
*@brief Decodes each string in `input` into a sequence of Unicode code points. \n

*@par Inputs:
include:
*@li input:The text to be decoded. Can have any shape. Note that the output is flattened
to a vector of char values. \n

*@par Attributes:
* input_encoding:Text encoding of the input strings. This is any of the encodings supported
by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
* errors:Error handling policy when there is invalid formatting found in the input.
The value of 'strict' will cause the operation to produce an InvalidArgument
error on any invalid input formatting. A value of 'replace' (the default) will
cause the operation to replace any invalid formatting in the input with the
`replacement_char` codepoint. A value of 'ignore' will cause the operation to
skip any invalid formatting in the input and produce no corresponding output
character.
* replacement_char:The replacement character codepoint to be used in place of any invalid
formatting in the input when `errors='replace'`. Any valid unicode codepoint may
be used. The default value is the Unicode replacement character,
0xFFFD (decimal 65533).
* replace_control_characters:Whether to replace the C0 control characters (00-1F) with the
`replacement_char`. Default is false. \n

*@par Outputs:
*@li row_splits:A 1D tensor containing the row splits.
*@li char_values:A 1D tensor containing the decoded codepoints. \n

*@see UnicodeDecode()

*@par Third-party framework compatibility
*compatible with UnicodeDecode op of tensorflow

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(UnicodeDecode)
.INPUT(input, TensorType({DT_STRING}))
.OUTPUT(row_splits, TensorType({DT_INT64}))
.OUTPUT(char_values, TensorType({DT_INT32}))
.REQUIRED_ATTR(input_encoding, String)
.ATTR(errors, String, "replace")
.ATTR(replacement_char, Int, 65533)
.ATTR(replace_control_characters, Bool, false)
.ATTR(Tsplits, Type, DT_INT64)
.OP_END_FACTORY_REG(UnicodeDecode)

/**
*@brief Transcode the input text from a source encoding to a destination encoding. \n

*@par Inputs:
include:
*@li input:The text to be processed. Can have any shape. \n

*@par Attributes:
* input_encoding:Text encoding of the input strings. This is any of the encodings supported
by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
* output_encoding:The unicode encoding to use in the output. Must be one of `"UTF-8", "UTF-16-BE", "UTF-32-BE"`.
Multi-byte encodings will be big-endian.
* errors:Error handling policy when there is invalid formatting found in the input.
The value of 'strict' will cause the operation to produce an InvalidArgument
error on any invalid input formatting. A value of 'replace' (the default) will
cause the operation to replace any invalid formatting in the input with the
`replacement_char` codepoint. A value of 'ignore' will cause the operation to
skip any invalid formatting in the input and produce no corresponding output
character.
* replacement_char:The replacement character codepoint to be used in place of any invalid
formatting in the input when `errors='replace'`. Any valid unicode codepoint may
be used. The default value is the Unicode replacement character,
0xFFFD (decimal 65533).
* replace_control_characters:Whether to replace the C0 control characters (00-1F) with the
`replacement_char`. Default is false. \n

*@par Outputs:
*@li output:A string tensor containing unicode text encoded using `output_encoding`. \n

*@see UnicodeTranscode()

*@par Third-party framework compatibility
*compatible with UnicodeTranscode op of tensorflow

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(UnicodeTranscode)
.INPUT(input, TensorType({DT_STRING}))
.OUTPUT(output, TensorType({DT_STRING}))
.REQUIRED_ATTR(input_encoding, String)
.ATTR(output_encoding, String, "UTF-8")
.ATTR(errors, String, "replace")
.ATTR(replacement_char, Int, 65533)
.ATTR(replace_control_characters, Bool, false)
.OP_END_FACTORY_REG(UnicodeTranscode)

/**
*@brief Encode a tensor of ints into unicode strings. \n

*@par Inputs:
include:
*@li input_values:A 1D tensor containing the unicode codepoints that should be encoded.
*@li input_splits:A 1D tensor specifying how the unicode codepoints should be split into strings. \n

*@par Attributes:
* output_encoding:The unicode encoding to use in the output. Must be one of `"UTF-8", "UTF-16-BE", "UTF-32-BE"`.
Multi-byte encodings will be big-endian.
* errors:Error handling policy when there is invalid formatting found in the input.
The value of 'strict' will cause the operation to produce an InvalidArgument
error on any invalid input formatting. A value of 'replace' (the default) will
cause the operation to replace any invalid formatting in the input with the
`replacement_char` codepoint. A value of 'ignore' will cause the operation to
skip any invalid formatting in the input and produce no corresponding output
character.
* replacement_char:The replacement character codepoint to be used in place of any invalid
formatting in the input when `errors='replace'`. Any valid unicode codepoint may
be used. The default value is the Unicode replacement character,
0xFFFD (decimal 65533). \n

*@par Outputs:
*@li output:The 1-D Tensor of strings encoded from the provided unicode codepoints. \n

*@see UnicodeEncode()

*@par Third-party framework compatibility
*compatible with UnicodeEncode op of tensorflow

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(UnicodeEncode)
.INPUT(input_values, TensorType({DT_INT32}))
.INPUT(input_splits, TensorType({DT_INT32, DT_INT64}))
.OUTPUT(output, TensorType({DT_STRING}))
.ATTR(errors, String, "replace")
.ATTR(output_encoding, String, "UTF-8")
.ATTR(replacement_char, Int, 65533)
.OP_END_FACTORY_REG(UnicodeEncode)
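
The UTF-8 branch of the encoding step above is compact enough to show in full; a self-contained C++ sketch (valid codepoints assumed, splits handling and the error policy omitted):

#include <cstdint>
#include <string>
#include <vector>

// Standard UTF-8 encoding: 1 to 4 bytes per codepoint.
std::string EncodeUtf8(const std::vector<uint32_t> &codepoints) {
  std::string out;
  for (uint32_t cp : codepoints) {
    if (cp < 0x80) {
      out += static_cast<char>(cp);
    } else if (cp < 0x800) {
      out += static_cast<char>(0xC0 | (cp >> 6));
      out += static_cast<char>(0x80 | (cp & 0x3F));
    } else if (cp < 0x10000) {
      out += static_cast<char>(0xE0 | (cp >> 12));
      out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
      out += static_cast<char>(0x80 | (cp & 0x3F));
    } else {
      out += static_cast<char>(0xF0 | (cp >> 18));
      out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
      out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
      out += static_cast<char>(0x80 | (cp & 0x3F));
    }
  }
  return out;
}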

/**
*@brief Split elements of input based on delimiter into a SparseTensor . \n
@@ -61,6 +290,116 @@ REG_OP(StringSplit)
.ATTR(skip_empty, Bool, true)
.OP_END_FACTORY_REG(StringSplit)

/**
*@brief Replaces the match of pattern in input with rewrite. \n

*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed. \n

*@par Attributes:
*@li pattern:A string. The regular expression to match the input.
*@li rewrite:A string. The rewrite to be applied to the matched expression.
*@li replace_global:An optional bool. Defaults to True. If True, the replacement is global,
otherwise the replacement is done only on the first match.

*@par Outputs:
*@li output: A Tensor of type string.
*/
REG_OP(StaticRegexReplace)
.INPUT(input, TensorType({DT_STRING}))
.OUTPUT(output, TensorType({DT_STRING}))
.ATTR(pattern, String, "")
.ATTR(rewrite, String, "")
.ATTR(replace_global, Bool, true)
.OP_END_FACTORY_REG(StaticRegexReplace)
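
The replace_global switch above maps naturally onto std::regex_replace's format_first_only flag; a host-side C++ illustration of the semantics (the TensorFlow op this mirrors uses an RE2-style dialect rather than ECMAScript, so treat this as an approximation for intuition only):

#include <regex>
#include <string>

// Global vs. first-match-only replacement, as selected by replace_global.
std::string RegexReplaceRef(const std::string &input,
                            const std::string &pattern,
                            const std::string &rewrite, bool replace_global) {
  const std::regex re(pattern);
  const auto flags = replace_global
                         ? std::regex_constants::format_default
                         : std::regex_constants::format_first_only;
  return std::regex_replace(input, re, rewrite, flags);
}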

/**
*@brief The input is a string tensor of any shape. The pattern is the
*regular expression to be matched with every element of the input tensor.
*The boolean values (True or False) of the output tensor indicate
*if the input matches the regex pattern provided.

*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed. \n

*@par Attributes:
*@li pattern:A string. The regular expression to match the input.

*@par Outputs:
*@li output: A bool tensor with the same shape as `input`.
*/
REG_OP(StaticRegexFullMatch)
.INPUT(input, TensorType({DT_STRING}))
.OUTPUT(output, TensorType({DT_BOOL}))
.ATTR(pattern, String, "")
.OP_END_FACTORY_REG(StaticRegexFullMatch)

/**
*@brief Joins the elements of `input` into segments given by `segment_ids`, placing `separator` between joined elements. \n

*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed.
*@li segment_ids:A Tensor. Must be one of the following types: int32, int64.
*A tensor whose shape is a prefix of data.shape. Negative segment ids are not supported.
*@li num_segments:A Tensor. Must be one of the following types: int32, int64. A scalar.

*@par Attributes:
*@li separator:An optional string. Defaults to "". The separator to use when joining.

*@par Outputs:
*@li output: A Tensor of type string.
*/
REG_OP(UnsortedSegmentJoin)
.INPUT(input, TensorType({DT_STRING}))
.INPUT(segment_ids, TensorType({DT_INT32,DT_INT64}))
.INPUT(num_segments, TensorType({DT_INT32,DT_INT64}))
.OUTPUT(output, TensorType({DT_STRING}))
.ATTR(separator, String, "")
.OP_END_FACTORY_REG(UnsortedSegmentJoin)
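
A 1-D C++ sketch of the join rule above (one bucket per segment id; names are illustrative):

#include <string>
#include <vector>

// Element i of `input` is appended to bucket segment_ids[i]; elements
// within a bucket are joined with `separator`.
std::vector<std::string> UnsortedSegmentJoin1D(
    const std::vector<std::string> &input,
    const std::vector<int> &segment_ids, int num_segments,
    const std::string &separator) {
  std::vector<std::string> out(num_segments);
  for (size_t i = 0; i < input.size(); ++i) {
    std::string &bucket = out[segment_ids[i]];
    if (!bucket.empty()) {
      bucket += separator;
    }
    bucket += input[i];
  }
  return out;
}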

/**
*@brief Converts each uppercase character in the input string tensor to lowercase.

*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed.

*@par Attributes:
*@li encoding:An optional string. Defaults to "".

*@par Outputs:
*@li output: A Tensor of type string.
*/
REG_OP(StringLower)
.INPUT(input, TensorType({DT_STRING}))
.OUTPUT(output, TensorType({DT_STRING}))
.ATTR(encoding, String, "")
.OP_END_FACTORY_REG(StringLower)

/**
*@brief Converts each lowercase character in the input string tensor to uppercase.

*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed.

*@par Attributes:
*@li encoding:An optional string. Defaults to "".

*@par Outputs:
*@li output: A Tensor of type string.
*/
REG_OP(StringUpper)
.INPUT(input, TensorType({DT_STRING}))
.OUTPUT(output, TensorType({DT_STRING}))
.ATTR(encoding, String, "")
.OP_END_FACTORY_REG(StringUpper)

/**
*@brief Split elements of source based on sep into a SparseTensor . \n

@@ -488,7 +827,7 @@ include:
*/
REG_OP(AsString)
.INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \
-                          DT_DOUBLE, DT_BOOL}))
+                          DT_DOUBLE, DT_BOOL, DT_COMPLEX64, DT_COMPLEX128}))
.OUTPUT(y, TensorType({DT_STRING}))
.ATTR(precision, Int, -1)
.ATTR(scientific, Bool, false)
@@ -557,6 +896,45 @@ REG_OP(DecodeBase64)
.INPUT(x, TensorType({DT_STRING}))
.OUTPUT(y, TensorType({DT_STRING}))
.OP_END_FACTORY_REG(DecodeBase64)

/**
*@brief StringNormalization performs string operations for basic cleaning . \n

*@par Inputs:
*@li input: only accepts [C] or [1, C] UTF-8 strings tensor . \n

*@par Outputs:
*@li output: UTF-8 strings tensor after cleaning . \n

*@par Attributes:
*@li stopwords : list of strings (default is empty).
*List of stop words. If not set, no word will be removed from the input strings
tensor.

*@li is_case_sensitive : bool (default is false).
*Whether the identification of stop words in the input strings tensor is
case-sensitive.

*@li case_change_action : string (default is "NONE").
*String enum that causes the output to be lowercased/uppercased/unchanged. Valid
values are "LOWER", "UPPER", "NONE".

*@li local : string (default is "en_US").
*Environment-dependent string that denotes the locale according to which the output
strings need to be upper/lowercased. Defaults to en_US or a platform-specific
equivalent as decided by the implementation . \n

*@attention Constraints:
*@li input can be either a 1-D or 2-D tensor, the shape of 2-D tensor must be [1, C].
*/
REG_OP(StringNormalizer)
.INPUT(input, TensorType({DT_STRING}))
.OUTPUT(output, TensorType({DT_STRING}))
.ATTR(stopwords, ListString, {})
.ATTR(is_case_sensitive, Bool, false)
.ATTR(case_change_action, String, "NONE")
.ATTR(local, String, "en_US")
.OP_END_FACTORY_REG(StringNormalizer)
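
An ASCII-only C++ sketch of the cleaning pipeline described above: stop-word removal (case-insensitive unless is_case_sensitive), followed by case_change_action. Locale-aware, full-Unicode casing is what the real op performs and is deliberately out of scope here.

#include <algorithm>
#include <cctype>
#include <string>
#include <vector>

std::vector<std::string> NormalizeStringsRef(
    const std::vector<std::string> &input,
    const std::vector<std::string> &stopwords, bool is_case_sensitive,
    const std::string &case_change_action) {
  auto to_lower = [](std::string s) {
    std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) {
      return static_cast<char>(std::tolower(c));
    });
    return s;
  };
  std::vector<std::string> out;
  for (const auto &word : input) {
    bool dropped = false;
    for (const auto &sw : stopwords) {
      if (is_case_sensitive ? word == sw : to_lower(word) == to_lower(sw)) {
        dropped = true;
        break;
      }
    }
    if (dropped) continue;
    std::string w = word;
    if (case_change_action == "LOWER") {
      w = to_lower(w);
    } else if (case_change_action == "UPPER") {
      std::transform(w.begin(), w.end(), w.begin(), [](unsigned char c) {
        return static_cast<char>(std::toupper(c));
      });
    }
    out.push_back(w);
  }
  return out;
}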
} // namespace ge

#endif // OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_

+ 1
- 1
third_party/fwkacllib/inc/ops/swap_co_ops.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


+ 1
- 1
third_party/fwkacllib/inc/ops/target_crop_and_resize.h

@@ -1,5 +1,5 @@
/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ * Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.


Some files were not shown because too many files changed in this diff
